From ee615ac26916f15290b63a20d4feb07ea702d5da Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 5 Jul 2025 11:34:57 -0700 Subject: [PATCH 001/759] Add warning when loading file unsafely. (#8800) --- comfy/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy/utils.py b/comfy/utils.py index 1f8d71292..47981d8f6 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -77,6 +77,7 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False): if safe_load or ALWAYS_SAFE_LOAD: pl_sd = torch.load(ckpt, map_location=device, weights_only=True, **torch_args) else: + logging.warning("WARNING: loading {} unsafely, upgrade your pytorch to 2.4 or newer to load this file safely.".format(ckpt)) pl_sd = torch.load(ckpt, map_location=device, pickle_module=comfy.checkpoint_pickle) if "state_dict" in pl_sd: sd = pl_sd["state_dict"] From 75d327abd5ee8fb6a1ec992ffdb8e43926574a63 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 6 Jul 2025 04:07:39 -0700 Subject: [PATCH 002/759] Remove some useless code. (#8812) --- comfy/gligen.py | 47 +---------------------------------------------- 1 file changed, 1 insertion(+), 46 deletions(-) diff --git a/comfy/gligen.py b/comfy/gligen.py index 161d8a5e5..1d7b6c2f4 100644 --- a/comfy/gligen.py +++ b/comfy/gligen.py @@ -1,55 +1,10 @@ import math import torch from torch import nn -from .ldm.modules.attention import CrossAttention -from inspect import isfunction +from .ldm.modules.attention import CrossAttention, FeedForward import comfy.ops ops = comfy.ops.manual_cast -def exists(val): - return val is not None - - -def uniq(arr): - return{el: True for el in arr}.keys() - - -def default(val, d): - if exists(val): - return val - return d() if isfunction(d) else d - - -# feedforward -class GEGLU(nn.Module): - def __init__(self, dim_in, dim_out): - super().__init__() - self.proj = ops.Linear(dim_in, dim_out * 2) - - def forward(self, x): - x, gate = self.proj(x).chunk(2, dim=-1) - return x * torch.nn.functional.gelu(gate) - - -class FeedForward(nn.Module): - def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): - super().__init__() - inner_dim = int(dim * mult) - dim_out = default(dim_out, dim) - project_in = nn.Sequential( - ops.Linear(dim, inner_dim), - nn.GELU() - ) if not glu else GEGLU(dim, inner_dim) - - self.net = nn.Sequential( - project_in, - nn.Dropout(dropout), - ops.Linear(inner_dim, dim_out) - ) - - def forward(self, x): - return self.net(x) - class GatedCrossAttentionDense(nn.Module): def __init__(self, query_dim, context_dim, n_heads, d_head): From 7eab7d29447429645c4c27a7e6c85d2be8ed79ff Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 6 Jul 2025 11:01:32 -0700 Subject: [PATCH 003/759] Remove dependency on deprecated torchaudio.save function (#8815) --- comfy_extras/nodes_audio.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 49af1eae4..aca09e4cc 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -133,14 +133,6 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non if sample_rate != audio["sample_rate"]: waveform = torchaudio.functional.resample(waveform, audio["sample_rate"], sample_rate) - # Create in-memory WAV buffer - wav_buffer = io.BytesIO() - torchaudio.save(wav_buffer, waveform, sample_rate, format="WAV") - wav_buffer.seek(0) # Rewind for reading - - # Use PyAV to convert and add metadata - input_container = av.open(wav_buffer) - # Create output with specified format output_buffer = io.BytesIO() output_container = av.open(output_buffer, mode='w', format=format) @@ -150,7 +142,6 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non output_container.metadata[key] = value # Set up the output stream with appropriate properties - input_container.streams.audio[0] if format == "opus": out_stream = output_container.add_stream("libopus", rate=sample_rate) if quality == "64k": @@ -175,18 +166,15 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non else: #format == "flac": out_stream = output_container.add_stream("flac", rate=sample_rate) - - # Copy frames from input to output - for frame in input_container.decode(audio=0): - frame.pts = None # Let PyAV handle timestamps - output_container.mux(out_stream.encode(frame)) + frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[0] == 1 else 'stereo') + frame.sample_rate = sample_rate + output_container.mux(out_stream.encode(frame)) # Flush encoder output_container.mux(out_stream.encode(None)) # Close containers output_container.close() - input_container.close() # Write the output to file output_buffer.seek(0) From e740dfd8061e696c9701c917e05649eca9b4f631 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 7 Jul 2025 00:16:00 -0700 Subject: [PATCH 004/759] Fix warning in audio save nodes. (#8818) --- comfy_extras/nodes_audio.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index aca09e4cc..8cd647846 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -168,6 +168,7 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[0] == 1 else 'stereo') frame.sample_rate = sample_rate + frame.pts = 0 output_container.mux(out_stream.encode(frame)) # Flush encoder From 059cd38aa21aa6b91ca117a92218532fcd6e9c75 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 8 Jul 2025 08:43:56 +0800 Subject: [PATCH 005/759] Update template and node docs package version (#8825) --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 27d385389..03e29ab44 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.32 -comfyui-embedded-docs==0.2.3 +comfyui-workflow-templates==0.1.33 +comfyui-embedded-docs==0.2.4 torch torchsde torchvision From 1359c969e4962c2373eaccdf6fdcc9214c6957c2 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Mon, 7 Jul 2025 20:35:41 -0700 Subject: [PATCH 006/759] Update template to 0.1.34 (#8829) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 03e29ab44..eb51479f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.33 +comfyui-workflow-templates==0.1.34 comfyui-embedded-docs==0.2.4 torch torchsde From b5e97db9ac69637dc0e29d3e6259bb2a333a0b27 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 8 Jul 2025 20:52:02 +0800 Subject: [PATCH 007/759] Update template to 0.1.35 (#8831) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index eb51479f6..19a40ca0e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.34 +comfyui-workflow-templates==0.1.35 comfyui-embedded-docs==0.2.4 torch torchsde From 9fd0cd7cf7534d44624457e37b0902b2cd6cb04f Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Tue, 8 Jul 2025 05:54:30 -0700 Subject: [PATCH 008/759] Add Moonvalley nodes (#8832) --- comfy_api/input/video_types.py | 19 +- comfy_api/input_impl/video_types.py | 9 + comfy_api_nodes/apis/__init__.py | 171 +++++++- comfy_api_nodes/nodes_moonvalley.py | 639 ++++++++++++++++++++++++++++ nodes.py | 1 + 5 files changed, 837 insertions(+), 2 deletions(-) create mode 100644 comfy_api_nodes/nodes_moonvalley.py diff --git a/comfy_api/input/video_types.py b/comfy_api/input/video_types.py index dc22d34ff..bb936e0a4 100644 --- a/comfy_api/input/video_types.py +++ b/comfy_api/input/video_types.py @@ -1,6 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Optional +from typing import Optional, Union +import io from comfy_api.util import VideoContainer, VideoCodec, VideoComponents class VideoInput(ABC): @@ -31,6 +32,22 @@ class VideoInput(ABC): """ pass + def get_stream_source(self) -> Union[str, io.BytesIO]: + """ + Get a streamable source for the video. This allows processing without + loading the entire video into memory. + + Returns: + Either a file path (str) or a BytesIO object that can be opened with av. + + Default implementation creates a BytesIO buffer, but subclasses should + override this for better performance when possible. + """ + buffer = io.BytesIO() + self.save_to(buffer) + buffer.seek(0) + return buffer + # Provide a default implementation, but subclasses can provide optimized versions # if possible. def get_dimensions(self) -> tuple[int, int]: diff --git a/comfy_api/input_impl/video_types.py b/comfy_api/input_impl/video_types.py index 197f6558c..9ae818f4e 100644 --- a/comfy_api/input_impl/video_types.py +++ b/comfy_api/input_impl/video_types.py @@ -64,6 +64,15 @@ class VideoFromFile(VideoInput): """ self.__file = file + def get_stream_source(self) -> str | io.BytesIO: + """ + Return the underlying file source for efficient streaming. + This avoids unnecessary memory copies when the source is already a file path. + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) + return self.__file + def get_dimensions(self) -> tuple[int, int]: """ Returns the dimensions of the video input. diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index e38d38cc9..086028abe 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: filtered-openapi.yaml -# timestamp: 2025-05-19T21:38:55+00:00 +# timestamp: 2025-07-06T09:47:31+00:00 from __future__ import annotations @@ -1355,6 +1355,158 @@ class ModelResponseProperties(BaseModel): ) +class Keyframes(BaseModel): + image_url: Optional[str] = None + + +class MoonvalleyPromptResponse(BaseModel): + error: Optional[Dict[str, Any]] = None + frame_conditioning: Optional[Dict[str, Any]] = None + id: Optional[str] = None + inference_params: Optional[Dict[str, Any]] = None + meta: Optional[Dict[str, Any]] = None + model_params: Optional[Dict[str, Any]] = None + output_url: Optional[str] = None + prompt_text: Optional[str] = None + status: Optional[str] = None + + +class MoonvalleyTextToVideoInferenceParams(BaseModel): + add_quality_guidance: Optional[bool] = Field( + True, description='Whether to add quality guidance' + ) + caching_coefficient: Optional[float] = Field( + 0.3, description='Caching coefficient for optimization' + ) + caching_cooldown: Optional[int] = Field( + 3, description='Number of caching cooldown steps' + ) + caching_warmup: Optional[int] = Field( + 3, description='Number of caching warmup steps' + ) + clip_value: Optional[float] = Field( + 3, description='CLIP value for generation control' + ) + conditioning_frame_index: Optional[int] = Field( + 0, description='Index of the conditioning frame' + ) + cooldown_steps: Optional[int] = Field( + None, description='Number of cooldown steps (calculated based on num_frames)' + ) + fps: Optional[int] = Field( + 24, description='Frames per second of the generated video' + ) + guidance_scale: Optional[float] = Field( + 12.5, description='Guidance scale for generation control' + ) + height: Optional[int] = Field( + 1080, description='Height of the generated video in pixels' + ) + negative_prompt: Optional[str] = Field(None, description='Negative prompt text') + num_frames: Optional[int] = Field(64, description='Number of frames to generate') + seed: Optional[int] = Field( + None, description='Random seed for generation (default: random)' + ) + shift_value: Optional[float] = Field( + 3, description='Shift value for generation control' + ) + steps: Optional[int] = Field(80, description='Number of denoising steps') + use_guidance_schedule: Optional[bool] = Field( + True, description='Whether to use guidance scheduling' + ) + use_negative_prompts: Optional[bool] = Field( + False, description='Whether to use negative prompts' + ) + use_timestep_transform: Optional[bool] = Field( + True, description='Whether to use timestep transformation' + ) + warmup_steps: Optional[int] = Field( + None, description='Number of warmup steps (calculated based on num_frames)' + ) + width: Optional[int] = Field( + 1920, description='Width of the generated video in pixels' + ) + + +class MoonvalleyTextToVideoRequest(BaseModel): + image_url: Optional[str] = None + inference_params: Optional[MoonvalleyTextToVideoInferenceParams] = None + prompt_text: Optional[str] = None + webhook_url: Optional[str] = None + + +class MoonvalleyUploadFileRequest(BaseModel): + file: Optional[StrictBytes] = None + + +class MoonvalleyUploadFileResponse(BaseModel): + access_url: Optional[str] = None + + +class MoonvalleyVideoToVideoInferenceParams(BaseModel): + add_quality_guidance: Optional[bool] = Field( + True, description='Whether to add quality guidance' + ) + caching_coefficient: Optional[float] = Field( + 0.3, description='Caching coefficient for optimization' + ) + caching_cooldown: Optional[int] = Field( + 3, description='Number of caching cooldown steps' + ) + caching_warmup: Optional[int] = Field( + 3, description='Number of caching warmup steps' + ) + clip_value: Optional[float] = Field( + 3, description='CLIP value for generation control' + ) + conditioning_frame_index: Optional[int] = Field( + 0, description='Index of the conditioning frame' + ) + cooldown_steps: Optional[int] = Field( + None, description='Number of cooldown steps (calculated based on num_frames)' + ) + guidance_scale: Optional[float] = Field( + 12.5, description='Guidance scale for generation control' + ) + negative_prompt: Optional[str] = Field(None, description='Negative prompt text') + seed: Optional[int] = Field( + None, description='Random seed for generation (default: random)' + ) + shift_value: Optional[float] = Field( + 3, description='Shift value for generation control' + ) + steps: Optional[int] = Field(80, description='Number of denoising steps') + use_guidance_schedule: Optional[bool] = Field( + True, description='Whether to use guidance scheduling' + ) + use_negative_prompts: Optional[bool] = Field( + False, description='Whether to use negative prompts' + ) + use_timestep_transform: Optional[bool] = Field( + True, description='Whether to use timestep transformation' + ) + warmup_steps: Optional[int] = Field( + None, description='Number of warmup steps (calculated based on num_frames)' + ) + + +class ControlType(str, Enum): + motion_control = 'motion_control' + pose_control = 'pose_control' + + +class MoonvalleyVideoToVideoRequest(BaseModel): + control_type: ControlType = Field( + ..., description='Supported types for video control' + ) + inference_params: Optional[MoonvalleyVideoToVideoInferenceParams] = None + prompt_text: str = Field(..., description='Describes the video to generate') + video_url: str = Field(..., description='Url to control video') + webhook_url: Optional[str] = Field( + None, description='Optional webhook URL for notifications' + ) + + class Moderation(str, Enum): low = 'low' auto = 'auto' @@ -3107,6 +3259,23 @@ class LumaUpscaleVideoGenerationRequest(BaseModel): resolution: Optional[LumaVideoModelOutputResolution] = None +class MoonvalleyImageToVideoRequest(MoonvalleyTextToVideoRequest): + keyframes: Optional[Dict[str, Keyframes]] = None + + +class MoonvalleyResizeVideoRequest(MoonvalleyVideoToVideoRequest): + frame_position: Optional[List[int]] = Field(None, max_length=2, min_length=2) + frame_resolution: Optional[List[int]] = Field(None, max_length=2, min_length=2) + scale: Optional[List[int]] = Field(None, max_length=2, min_length=2) + + +class MoonvalleyTextToImageRequest(BaseModel): + image_url: Optional[str] = None + inference_params: Optional[MoonvalleyTextToVideoInferenceParams] = None + prompt_text: Optional[str] = None + webhook_url: Optional[str] = None + + class OutputContent(RootModel[Union[OutputTextContent, OutputAudioContent]]): root: Union[OutputTextContent, OutputAudioContent] diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py new file mode 100644 index 000000000..6e937411c --- /dev/null +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -0,0 +1,639 @@ +import logging +from typing import Any, Callable, Optional, TypeVar +import random +import torch +from comfy_api_nodes.util.validation_utils import get_image_dimensions, validate_image_dimensions, validate_video_dimensions + + +from comfy_api_nodes.apis import ( + MoonvalleyTextToVideoRequest, + MoonvalleyTextToVideoInferenceParams, + MoonvalleyVideoToVideoInferenceParams, + MoonvalleyVideoToVideoRequest, + MoonvalleyPromptResponse +) +from comfy_api_nodes.apis.client import ( + ApiEndpoint, + HttpMethod, + SynchronousOperation, + PollingOperation, + EmptyRequest, +) +from comfy_api_nodes.apinode_utils import ( + download_url_to_video_output, + upload_images_to_comfyapi, + upload_video_to_comfyapi, +) +from comfy_api_nodes.mapper_utils import model_field_to_node_input + +from comfy_api.input.video_types import VideoInput +from comfy.comfy_types.node_typing import IO +from comfy_api.input_impl import VideoFromFile +import av +import io + +API_UPLOADS_ENDPOINT = "/proxy/moonvalley/uploads" +API_PROMPTS_ENDPOINT = "/proxy/moonvalley/prompts" +API_VIDEO2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/video-to-video" +API_TXT2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/text-to-video" +API_IMG2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/image-to-video" + +MIN_WIDTH = 300 +MIN_HEIGHT = 300 + +MAX_WIDTH = 10000 +MAX_HEIGHT = 10000 + +MIN_VID_WIDTH = 300 +MIN_VID_HEIGHT = 300 + +MAX_VID_WIDTH = 10000 +MAX_VID_HEIGHT = 10000 + +MAX_VIDEO_SIZE = 1024 * 1024 * 1024 # 1 GB max for in-memory video processing + +MOONVALLEY_MAREY_MAX_PROMPT_LENGTH = 5000 +R = TypeVar("R") +class MoonvalleyApiError(Exception): + """Base exception for Moonvalley API errors.""" + pass + +def is_valid_task_creation_response(response: MoonvalleyPromptResponse) -> bool: + """Verifies that the initial response contains a task ID.""" + return bool(response.id) + +def validate_task_creation_response(response) -> None: + if not is_valid_task_creation_response(response): + error_msg = f"Moonvalley Marey API: Initial request failed. Code: {response.code}, Message: {response.message}, Data: {response}" + logging.error(error_msg) + raise MoonvalleyApiError(error_msg) + +def get_video_from_response(response): + video = response.output_url + logging.info( + "Moonvalley Marey API: Task %s succeeded. Video URL: %s", response.id, video + ) + return video + + +def get_video_url_from_response(response) -> Optional[str]: + """Returns the first video url from the Moonvalley video generation task result. + Will not raise an error if the response is not valid. + """ + if response: + return str(get_video_from_response(response)) + else: + return None + + +def poll_until_finished( + auth_kwargs: dict[str, str], + api_endpoint: ApiEndpoint[Any, R], + result_url_extractor: Optional[Callable[[R], str]] = None, + node_id: Optional[str] = None, +) -> R: + """Polls the Moonvalley API endpoint until the task reaches a terminal state, then returns the response.""" + return PollingOperation( + poll_endpoint=api_endpoint, + completed_statuses=[ + "completed", + ], + max_poll_attempts=240, # 64 minutes with 16s interval + poll_interval=16.0, + failed_statuses=["error"], + status_extractor=lambda response: ( + response.status + if response and response.status + else None + ), + auth_kwargs=auth_kwargs, + result_url_extractor=result_url_extractor, + node_id=node_id, + ).execute() + +def validate_prompts(prompt:str, negative_prompt: str, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH): + """Verifies that the prompt isn't empty and that neither prompt is too long.""" + if not prompt: + raise ValueError("Positive prompt is empty") + if len(prompt) > max_length: + raise ValueError(f"Positive prompt is too long: {len(prompt)} characters") + if negative_prompt and len(negative_prompt) > max_length: + raise ValueError( + f"Negative prompt is too long: {len(negative_prompt)} characters" + ) + return True + +def validate_input_media(width, height, with_frame_conditioning, num_frames_in=None): + # inference validation + # T = num_frames + # in all cases, the following must be true: T divisible by 16 and H,W by 8. in addition... + # with image conditioning: H*W must be divisible by 8192 + # without image conditioning: T divisible by 32 + if num_frames_in and not num_frames_in % 16 == 0 : + return False, ( + "The input video total frame count must be divisible by 16!" + ) + + if height % 8 != 0 or width % 8 != 0: + return False, ( + f"Height ({height}) and width ({width}) must be " "divisible by 8" + ) + + if with_frame_conditioning: + if (height * width) % 8192 != 0: + return False, ( + f"Height * width ({height * width}) must be " + "divisible by 8192 for frame conditioning" + ) + else: + if num_frames_in and not num_frames_in % 32 == 0 : + return False, ( + "The input video total frame count must be divisible by 32!" + ) + + +def validate_input_image(image: torch.Tensor, with_frame_conditioning: bool=False) -> None: + """ + Validates the input image adheres to the expectations of the API: + - The image resolution should not be less than 300*300px + - The aspect ratio of the image should be between 1:2.5 ~ 2.5:1 + + """ + height, width = get_image_dimensions(image) + validate_input_media(width, height, with_frame_conditioning ) + validate_image_dimensions(image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH) + +def validate_input_video(video: VideoInput, num_frames_out: int, with_frame_conditioning: bool=False): + try: + width, height = video.get_dimensions() + except Exception as e: + logging.error("Error getting dimensions of video: %s", e) + raise ValueError(f"Cannot get video dimensions: {e}") from e + + validate_input_media(width, height, with_frame_conditioning) + validate_video_dimensions(video, min_width=MIN_VID_WIDTH, min_height=MIN_VID_HEIGHT, max_width=MAX_VID_WIDTH, max_height=MAX_VID_HEIGHT) + + trimmed_video = validate_input_video_length(video, num_frames_out) + return trimmed_video + + +def validate_input_video_length(video: VideoInput, num_frames: int): + + if video.get_duration() > 60: + raise MoonvalleyApiError("Input Video lenth should be less than 1min. Please trim.") + + if num_frames == 128: + if video.get_duration() < 5: + raise MoonvalleyApiError("Input Video length is less than 5s. Please use a video longer than or equal to 5s.") + if video.get_duration() > 5: + # trim video to 5s + video = trim_video(video, 5) + if num_frames == 256: + if video.get_duration() < 10: + raise MoonvalleyApiError("Input Video length is less than 10s. Please use a video longer than or equal to 10s.") + if video.get_duration() > 10: + # trim video to 10s + video = trim_video(video, 10) + return video + +def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: + """ + Returns a new VideoInput object trimmed from the beginning to the specified duration, + using av to avoid loading entire video into memory. + + Args: + video: Input video to trim + duration_sec: Duration in seconds to keep from the beginning + + Returns: + VideoFromFile object that owns the output buffer + """ + output_buffer = io.BytesIO() + + input_container = None + output_container = None + + try: + # Get the stream source - this avoids loading entire video into memory + # when the source is already a file path + input_source = video.get_stream_source() + + # Open containers + input_container = av.open(input_source, mode='r') + output_container = av.open(output_buffer, mode='w', format='mp4') + + # Set up output streams for re-encoding + video_stream = None + audio_stream = None + + for stream in input_container.streams: + logging.info(f"Found stream: type={stream.type}, class={type(stream)}") + if isinstance(stream, av.VideoStream): + # Create output video stream with same parameters + video_stream = output_container.add_stream('h264', rate=stream.average_rate) + video_stream.width = stream.width + video_stream.height = stream.height + video_stream.pix_fmt = 'yuv420p' + logging.info(f"Added video stream: {stream.width}x{stream.height} @ {stream.average_rate}fps") + elif isinstance(stream, av.AudioStream): + # Create output audio stream with same parameters + audio_stream = output_container.add_stream('aac', rate=stream.sample_rate) + audio_stream.sample_rate = stream.sample_rate + audio_stream.layout = stream.layout + logging.info(f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels") + + # Calculate target frame count that's divisible by 32 + fps = input_container.streams.video[0].average_rate + estimated_frames = int(duration_sec * fps) + target_frames = (estimated_frames // 32) * 32 # Round down to nearest multiple of 32 + + if target_frames == 0: + raise ValueError("Video too short: need at least 32 frames for Moonvalley") + + frame_count = 0 + audio_frame_count = 0 + + # Decode and re-encode video frames + if video_stream: + for frame in input_container.decode(video=0): + if frame_count >= target_frames: + break + + # Re-encode frame + for packet in video_stream.encode(frame): + output_container.mux(packet) + frame_count += 1 + + # Flush encoder + for packet in video_stream.encode(): + output_container.mux(packet) + + logging.info(f"Encoded {frame_count} video frames (target: {target_frames})") + + # Decode and re-encode audio frames + if audio_stream: + input_container.seek(0) # Reset to beginning for audio + for frame in input_container.decode(audio=0): + if frame.time >= duration_sec: + break + + # Re-encode frame + for packet in audio_stream.encode(frame): + output_container.mux(packet) + audio_frame_count += 1 + + # Flush encoder + for packet in audio_stream.encode(): + output_container.mux(packet) + + logging.info(f"Encoded {audio_frame_count} audio frames") + + # Close containers + output_container.close() + input_container.close() + + + # Return as VideoFromFile using the buffer + output_buffer.seek(0) + return VideoFromFile(output_buffer) + + except Exception as e: + # Clean up on error + if input_container is not None: + input_container.close() + if output_container is not None: + output_container.close() + raise RuntimeError(f"Failed to trim video: {str(e)}") from e + +# --- BaseMoonvalleyVideoNode --- +class BaseMoonvalleyVideoNode: + def parseWidthHeightFromRes(self, resolution: str): + # Accepts a string like "16:9 (1920 x 1080)" and returns width, height as a dict + res_map = { + "16:9 (1920 x 1080)": {"width": 1920, "height": 1080}, + "9:16 (1080 x 1920)": {"width": 1080, "height": 1920}, + "1:1 (1152 x 1152)": {"width": 1152, "height": 1152}, + "4:3 (1440 x 1080)": {"width": 1440, "height": 1080}, + "3:4 (1080 x 1440)": {"width": 1080, "height": 1440}, + "21:9 (2560 x 1080)": {"width": 2560, "height": 1080}, + } + if resolution in res_map: + return res_map[resolution] + else: + # Default to 1920x1080 if unknown + return {"width": 1920, "height": 1080} + + def parseControlParameter(self, value): + control_map = { + "Motion Transfer": "motion_control", + "Canny": "canny_control", + "Pose Transfer": "pose_control", + "Depth": "depth_control" + } + if value in control_map: + return control_map[value] + else: + return control_map["Motion Transfer"] + + def get_response( + self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None + ) -> MoonvalleyPromptResponse: + return poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=f"{API_PROMPTS_ENDPOINT}/{task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MoonvalleyPromptResponse, + ), + result_url_extractor=get_video_url_from_response, + node_id=node_id, + ) + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "prompt": model_field_to_node_input( + IO.STRING, MoonvalleyTextToVideoRequest, "prompt_text", + multiline=True + ), + "negative_prompt": model_field_to_node_input( + IO.STRING, + MoonvalleyTextToVideoInferenceParams, + "negative_prompt", + multiline=True, + default="gopro, bright, contrast, static, overexposed, bright, vignette, artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, flare, saturation, distorted, warped, wide angle, contrast, saturated, vibrant, glowing, cross dissolve, texture, videogame, saturation, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, blown out, horrible, blurry, worst quality, bad, transition, dissolve, cross-dissolve, melt, fade in, fade out, wobbly, weird, low quality, plastic, stock footage, video camera, boring, static", + ), + + "resolution": (IO.COMBO, { + "options": ["16:9 (1920 x 1080)", + "9:16 (1080 x 1920)", + "1:1 (1152 x 1152)", + "4:3 (1440 x 1080)", + "3:4 (1080 x 1440)", + "21:9 (2560 x 1080)"], + "default": "16:9 (1920 x 1080)", + "tooltip": "Resolution of the output video", + }), + # "length": (IO.COMBO,{"options":['5s','10s'], "default": '5s'}), + "prompt_adherence": model_field_to_node_input(IO.FLOAT,MoonvalleyTextToVideoInferenceParams,"guidance_scale",default=7.0, step=1, min=1, max=20), + "seed": model_field_to_node_input(IO.INT,MoonvalleyTextToVideoInferenceParams, "seed", default=random.randint(0, 2**32 - 1), min=0, max=4294967295, step=1, display="number", tooltip="Random seed value", control_after_generate=True), + "steps": model_field_to_node_input(IO.INT, MoonvalleyTextToVideoInferenceParams, "steps", default=100, min=1, max=100), + }, + "hidden": { + "auth_token": "AUTH_TOKEN_COMFY_ORG", + "comfy_api_key": "API_KEY_COMFY_ORG", + "unique_id": "UNIQUE_ID", + }, + "optional": { + "image": model_field_to_node_input( + IO.IMAGE, + MoonvalleyTextToVideoRequest, + "image_url", + tooltip="The reference image used to generate the video", + ), + } + } + + RETURN_TYPES = ("STRING",) + FUNCTION = "generate" + CATEGORY = "api node/video/Moonvalley Marey" + API_NODE = True + + def generate(self, **kwargs): + return None + +# --- MoonvalleyImg2VideoNode --- +class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): + + @classmethod + def INPUT_TYPES(cls): + return super().INPUT_TYPES() + + RETURN_TYPES = ("VIDEO",) + RETURN_NAMES = ("video",) + DESCRIPTION = "Moonvalley Marey Image to Video Node" + + def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): + image = kwargs.get("image", None) + if (image is None): + raise MoonvalleyApiError("image is required") + total_frames = get_total_frames_from_length() + + validate_input_image(image,True) + validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) + + inference_params=MoonvalleyTextToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + num_frames=total_frames, + width=width_height.get("width"), + height=width_height.get("height"), + use_negative_prompts=True + ) + """Upload image to comfy backend to have a URL available for further processing""" + # Get MIME type from tensor - assuming PNG format for image tensors + mime_type = "image/png" + + image_url = upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type)[0] + + request = MoonvalleyTextToVideoRequest( + image_url=image_url, + prompt_text=prompt, + inference_params=inference_params + ) + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint(path=API_IMG2VIDEO_ENDPOINT, + method=HttpMethod.POST, + request_model=MoonvalleyTextToVideoRequest, + response_model=MoonvalleyPromptResponse + ), + request=request, + auth_kwargs=kwargs, + ) + task_creation_response = initial_operation.execute() + validate_task_creation_response(task_creation_response) + task_id = task_creation_response.id + + final_response = self.get_response( + task_id, auth_kwargs=kwargs, node_id=unique_id + ) + video = download_url_to_video_output(final_response.output_url) + return (video, ) + +# --- MoonvalleyVid2VidNode --- +class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): + def __init__(self): + super().__init__() + + @classmethod + def INPUT_TYPES(cls): + input_types = super().INPUT_TYPES() + for param in ["resolution", "image"]: + if param in input_types["required"]: + del input_types["required"][param] + if param in input_types["optional"]: + del input_types["optional"][param] + input_types["optional"] = { + "video": (IO.VIDEO, {"default": "", "multiline": False, "tooltip": "The reference video used to generate the output video. Input a 5s video for 128 frames and a 10s video for 256 frames. Longer videos will be trimmed automatically."}), + "control_type": ( + ["Motion Transfer", "Pose Transfer"], + {"default": "Motion Transfer"}, + ), + "motion_intensity": ( + "INT", + { + "default": 100, + "step": 1, + "min": 0, + "max": 100, + "tooltip": "Only used if control_type is 'Motion Transfer'", + }, + ) + } + + return input_types + + RETURN_TYPES = ("VIDEO",) + RETURN_NAMES = ("video",) + + def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): + video = kwargs.get("video") + num_frames = get_total_frames_from_length() + + if not video : + raise MoonvalleyApiError("video is required") + + + """Validate video input""" + video_url="" + if video: + validated_video = validate_input_video(video, num_frames, False) + video_url = upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) + + control_type = kwargs.get("control_type") + motion_intensity = kwargs.get("motion_intensity") + + """Validate prompts and inference input""" + validate_prompts(prompt, negative_prompt) + inference_params=MoonvalleyVideoToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + control_params={'motion_intensity': motion_intensity} + ) + + control = self.parseControlParameter(control_type) + + request = MoonvalleyVideoToVideoRequest( + control_type=control, + video_url=video_url, + prompt_text=prompt, + inference_params=inference_params + ) + + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint(path=API_VIDEO2VIDEO_ENDPOINT, + method=HttpMethod.POST, + request_model=MoonvalleyVideoToVideoRequest, + response_model=MoonvalleyPromptResponse + ), + request=request, + auth_kwargs=kwargs, + ) + task_creation_response = initial_operation.execute() + validate_task_creation_response(task_creation_response) + task_id = task_creation_response.id + + final_response = self.get_response( + task_id, auth_kwargs=kwargs, node_id=unique_id + ) + + video = download_url_to_video_output(final_response.output_url) + + return (video, ) + +# --- MoonvalleyTxt2VideoNode --- +class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): + def __init__(self): + super().__init__() + + RETURN_TYPES = ("VIDEO",) + RETURN_NAMES = ("video",) + + @classmethod + def INPUT_TYPES(cls): + input_types = super().INPUT_TYPES() + # Remove image-specific parameters + for param in ["image"]: + if param in input_types["optional"]: + del input_types["optional"][param] + return input_types + + def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): + validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) + num_frames = get_total_frames_from_length() + + inference_params=MoonvalleyTextToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + num_frames=num_frames, + width=width_height.get("width"), + height=width_height.get("height"), + ) + request = MoonvalleyTextToVideoRequest( + prompt_text=prompt, + inference_params=inference_params + ) + + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint(path=API_TXT2VIDEO_ENDPOINT, + method=HttpMethod.POST, + request_model=MoonvalleyTextToVideoRequest, + response_model=MoonvalleyPromptResponse + ), + request=request, + auth_kwargs=kwargs, + ) + task_creation_response = initial_operation.execute() + validate_task_creation_response(task_creation_response) + task_id = task_creation_response.id + + final_response = self.get_response( + task_id, auth_kwargs=kwargs, node_id=unique_id + ) + + video = download_url_to_video_output(final_response.output_url) + return (video, ) + + + +NODE_CLASS_MAPPINGS = { + "MoonvalleyImg2VideoNode": MoonvalleyImg2VideoNode, + "MoonvalleyTxt2VideoNode": MoonvalleyTxt2VideoNode, + # "MoonvalleyVideo2VideoNode": MoonvalleyVideo2VideoNode, +} + + +NODE_DISPLAY_NAME_MAPPINGS = { + "MoonvalleyImg2VideoNode": "Moonvalley Marey Image to Video", + "MoonvalleyTxt2VideoNode": "Moonvalley Marey Text to Video", + # "MoonvalleyVideo2VideoNode": "Moonvalley Marey Video to Video", +} + +def get_total_frames_from_length(length="5s"): + # if length == '5s': + # return 128 + # elif length == '10s': + # return 256 + return 128 + # else: + # raise MoonvalleyApiError("length is required") diff --git a/nodes.py b/nodes.py index 1b465b9e6..231d4d4de 100644 --- a/nodes.py +++ b/nodes.py @@ -2310,6 +2310,7 @@ def init_builtin_api_nodes(): "nodes_pika.py", "nodes_runway.py", "nodes_tripo.py", + "nodes_moonvalley.py", "nodes_rodin.py", "nodes_gemini.py", ] From c5de4955bb91a2b136027a698aaecb8d19e3d892 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 8 Jul 2025 08:56:38 -0400 Subject: [PATCH 009/759] ComfyUI version 0.3.44 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index c98c90499..7981fbaca 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.43" +__version__ = "0.3.44" diff --git a/pyproject.toml b/pyproject.toml index 9d0f90032..96ead2157 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.43" +version = "0.3.44" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 974254218ab873e8b9642b6a467d56842cd228c4 Mon Sep 17 00:00:00 2001 From: josephrocca <1167575+josephrocca@users.noreply.github.com> Date: Wed, 9 Jul 2025 03:56:59 +0800 Subject: [PATCH 010/759] Un-hardcode chroma patch_size (#8840) --- comfy/ldm/chroma/model.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py index c75023a31..06021d4f2 100644 --- a/comfy/ldm/chroma/model.py +++ b/comfy/ldm/chroma/model.py @@ -254,13 +254,12 @@ class Chroma(nn.Module): def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs): bs, c, h, w = x.shape - patch_size = 2 - x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size)) + x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size)) - img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size) + img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=self.patch_size, pw=self.patch_size) - h_len = ((h + (patch_size // 2)) // patch_size) - w_len = ((w + (patch_size // 2)) // patch_size) + h_len = ((h + (self.patch_size // 2)) // self.patch_size) + w_len = ((w + (self.patch_size // 2)) // self.patch_size) img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) @@ -268,4 +267,4 @@ class Chroma(nn.Module): txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) out = self.forward_orig(img, img_ids, context, txt_ids, timestep, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None)) - return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w] + return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=self.patch_size, pw=self.patch_size)[:,:,:h,:w] From aac10ad23a8f65243fcfca7afa7a72c1740312d6 Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Wed, 9 Jul 2025 04:17:06 +0800 Subject: [PATCH 011/759] Add SA-Solver sampler (#8834) --- comfy/k_diffusion/sa_solver.py | 121 +++++++++++++++++++++++++++ comfy/k_diffusion/sampling.py | 111 ++++++++++++++++++++++++ comfy/samplers.py | 2 +- comfy_extras/nodes_custom_sampler.py | 45 ++++++++++ 4 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 comfy/k_diffusion/sa_solver.py diff --git a/comfy/k_diffusion/sa_solver.py b/comfy/k_diffusion/sa_solver.py new file mode 100644 index 000000000..0c6821b60 --- /dev/null +++ b/comfy/k_diffusion/sa_solver.py @@ -0,0 +1,121 @@ +# SA-Solver: Stochastic Adams Solver (NeurIPS 2023, arXiv:2309.05019) +# Conference: https://proceedings.neurips.cc/paper_files/paper/2023/file/f4a6806490d31216a3ba667eb240c897-Paper-Conference.pdf +# Codebase ref: https://github.com/scxue/SA-Solver + +import math +from typing import Union, Callable +import torch + + +def compute_exponential_coeffs(s: torch.Tensor, t: torch.Tensor, solver_order: int, tau_t: float) -> torch.Tensor: + """Compute (1 + tau^2) * integral of exp((1 + tau^2) * x) * x^p dx from s to t with exp((1 + tau^2) * t) factored out, using integration by parts. + + Integral of exp((1 + tau^2) * x) * x^p dx + = product_terms[p] - (p / (1 + tau^2)) * integral of exp((1 + tau^2) * x) * x^(p-1) dx, + with base case p=0 where integral equals product_terms[0]. + + where + product_terms[p] = x^p * exp((1 + tau^2) * x) / (1 + tau^2). + + Construct a recursive coefficient matrix following the above recursive relation to compute all integral terms up to p = (solver_order - 1). + Return coefficients used by the SA-Solver in data prediction mode. + + Args: + s: Start time s. + t: End time t. + solver_order: Current order of the solver. + tau_t: Stochastic strength parameter in the SDE. + + Returns: + Exponential coefficients used in data prediction, with exp((1 + tau^2) * t) factored out, ordered from p=0 to p=solver_order−1, shape (solver_order,). + """ + tau_mul = 1 + tau_t ** 2 + h = t - s + p = torch.arange(solver_order, dtype=s.dtype, device=s.device) + + # product_terms after factoring out exp((1 + tau^2) * t) + # Includes (1 + tau^2) factor from outside the integral + product_terms_factored = (t ** p - s ** p * (-tau_mul * h).exp()) + + # Lower triangular recursive coefficient matrix + # Accumulates recursive coefficients based on p / (1 + tau^2) + recursive_depth_mat = p.unsqueeze(1) - p.unsqueeze(0) + log_factorial = (p + 1).lgamma() + recursive_coeff_mat = log_factorial.unsqueeze(1) - log_factorial.unsqueeze(0) + if tau_t > 0: + recursive_coeff_mat = recursive_coeff_mat - (recursive_depth_mat * math.log(tau_mul)) + signs = torch.where(recursive_depth_mat % 2 == 0, 1.0, -1.0) + recursive_coeff_mat = (recursive_coeff_mat.exp() * signs).tril() + + return recursive_coeff_mat @ product_terms_factored + + +def compute_simple_stochastic_adams_b_coeffs(sigma_next: torch.Tensor, curr_lambdas: torch.Tensor, lambda_s: torch.Tensor, lambda_t: torch.Tensor, tau_t: float, is_corrector_step: bool = False) -> torch.Tensor: + """Compute simple order-2 b coefficients from SA-Solver paper (Appendix D. Implementation Details).""" + tau_mul = 1 + tau_t ** 2 + h = lambda_t - lambda_s + alpha_t = sigma_next * lambda_t.exp() + if is_corrector_step: + # Simplified 1-step (order-2) corrector + b_1 = alpha_t * (0.5 * tau_mul * h) + b_2 = alpha_t * (-h * tau_mul).expm1().neg() - b_1 + else: + # Simplified 2-step predictor + b_2 = alpha_t * (0.5 * tau_mul * h ** 2) / (curr_lambdas[-2] - lambda_s) + b_1 = alpha_t * (-h * tau_mul).expm1().neg() - b_2 + return torch.stack([b_2, b_1]) + + +def compute_stochastic_adams_b_coeffs(sigma_next: torch.Tensor, curr_lambdas: torch.Tensor, lambda_s: torch.Tensor, lambda_t: torch.Tensor, tau_t: float, simple_order_2: bool = False, is_corrector_step: bool = False) -> torch.Tensor: + """Compute b_i coefficients for the SA-Solver (see eqs. 15 and 18). + + The solver order corresponds to the number of input lambdas (half-logSNR points). + + Args: + sigma_next: Sigma at end time t. + curr_lambdas: Lambda time points used to construct the Lagrange basis, shape (N,). + lambda_s: Lambda at start time s. + lambda_t: Lambda at end time t. + tau_t: Stochastic strength parameter in the SDE. + simple_order_2: Whether to enable the simple order-2 scheme. + is_corrector_step: Flag for corrector step in simple order-2 mode. + + Returns: + b_i coefficients for the SA-Solver, shape (N,), where N is the solver order. + """ + num_timesteps = curr_lambdas.shape[0] + + if simple_order_2 and num_timesteps == 2: + return compute_simple_stochastic_adams_b_coeffs(sigma_next, curr_lambdas, lambda_s, lambda_t, tau_t, is_corrector_step) + + # Compute coefficients by solving a linear system from Lagrange basis interpolation + exp_integral_coeffs = compute_exponential_coeffs(lambda_s, lambda_t, num_timesteps, tau_t) + vandermonde_matrix_T = torch.vander(curr_lambdas, num_timesteps, increasing=True).T + lagrange_integrals = torch.linalg.solve(vandermonde_matrix_T, exp_integral_coeffs) + + # (sigma_t * exp(-tau^2 * lambda_t)) * exp((1 + tau^2) * lambda_t) + # = sigma_t * exp(lambda_t) = alpha_t + # exp((1 + tau^2) * lambda_t) is extracted from the integral + alpha_t = sigma_next * lambda_t.exp() + return alpha_t * lagrange_integrals + + +def get_tau_interval_func(start_sigma: float, end_sigma: float, eta: float = 1.0) -> Callable[[Union[torch.Tensor, float]], float]: + """Return a function that controls the stochasticity of SA-Solver. + + When eta = 0, SA-Solver runs as ODE. The official approach uses + time t to determine the SDE interval, while here we use sigma instead. + + See: + https://github.com/scxue/SA-Solver/blob/main/README.md + """ + + def tau_func(sigma: Union[torch.Tensor, float]) -> float: + if eta <= 0: + return 0.0 # ODE + + if isinstance(sigma, torch.Tensor): + sigma = sigma.item() + return eta if start_sigma >= sigma >= end_sigma else 0.0 + + return tau_func diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index 34218337a..2ed415b1f 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -9,6 +9,7 @@ from tqdm.auto import trange, tqdm from . import utils from . import deis +from . import sa_solver import comfy.model_patcher import comfy.model_sampling @@ -1648,3 +1649,113 @@ def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=Non if inject_noise: x = x + sigmas[i + 1] * (noise_coeff_3 * noise_1 + noise_coeff_2 * noise_2 + noise_coeff_1 * noise_3) * s_noise return x + + +@torch.no_grad() +def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, use_pece=False, simple_order_2=False): + """Stochastic Adams Solver with predictor-corrector method (NeurIPS 2023).""" + if len(sigmas) <= 1: + return x + extra_args = {} if extra_args is None else extra_args + seed = extra_args.get("seed", None) + noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler + s_in = x.new_ones([x.shape[0]]) + + model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling") + sigmas = offset_first_sigma_for_snr(sigmas, model_sampling) + lambdas = sigma_to_half_log_snr(sigmas, model_sampling=model_sampling) + + if tau_func is None: + # Use default interval for stochastic sampling + start_sigma = model_sampling.percent_to_sigma(0.2) + end_sigma = model_sampling.percent_to_sigma(0.8) + tau_func = sa_solver.get_tau_interval_func(start_sigma, end_sigma, eta=1.0) + + max_used_order = max(predictor_order, corrector_order) + x_pred = x # x: current state, x_pred: predicted next state + + h = 0.0 + tau_t = 0.0 + noise = 0.0 + pred_list = [] + + # Lower order near the end to improve stability + lower_order_to_end = sigmas[-1].item() == 0 + + for i in trange(len(sigmas) - 1, disable=disable): + # Evaluation + denoised = model(x_pred, sigmas[i] * s_in, **extra_args) + if callback is not None: + callback({"x": x_pred, "i": i, "sigma": sigmas[i], "sigma_hat": sigmas[i], "denoised": denoised}) + pred_list.append(denoised) + pred_list = pred_list[-max_used_order:] + + predictor_order_used = min(predictor_order, len(pred_list)) + if i == 0 or (sigmas[i + 1] == 0 and not use_pece): + corrector_order_used = 0 + else: + corrector_order_used = min(corrector_order, len(pred_list)) + + if lower_order_to_end: + predictor_order_used = min(predictor_order_used, len(sigmas) - 2 - i) + corrector_order_used = min(corrector_order_used, len(sigmas) - 1 - i) + + # Corrector + if corrector_order_used == 0: + # Update by the predicted state + x = x_pred + else: + curr_lambdas = lambdas[i - corrector_order_used + 1:i + 1] + b_coeffs = sa_solver.compute_stochastic_adams_b_coeffs( + sigmas[i], + curr_lambdas, + lambdas[i - 1], + lambdas[i], + tau_t, + simple_order_2, + is_corrector_step=True, + ) + pred_mat = torch.stack(pred_list[-corrector_order_used:], dim=1) # (B, K, ...) + corr_res = torch.tensordot(pred_mat, b_coeffs, dims=([1], [0])) # (B, ...) + x = sigmas[i] / sigmas[i - 1] * (-(tau_t ** 2) * h).exp() * x + corr_res + + if tau_t > 0 and s_noise > 0: + # The noise from the previous predictor step + x = x + noise + + if use_pece: + # Evaluate the corrected state + denoised = model(x, sigmas[i] * s_in, **extra_args) + pred_list[-1] = denoised + + # Predictor + if sigmas[i + 1] == 0: + # Denoising step + x = denoised + else: + tau_t = tau_func(sigmas[i + 1]) + curr_lambdas = lambdas[i - predictor_order_used + 1:i + 1] + b_coeffs = sa_solver.compute_stochastic_adams_b_coeffs( + sigmas[i + 1], + curr_lambdas, + lambdas[i], + lambdas[i + 1], + tau_t, + simple_order_2, + is_corrector_step=False, + ) + pred_mat = torch.stack(pred_list[-predictor_order_used:], dim=1) # (B, K, ...) + pred_res = torch.tensordot(pred_mat, b_coeffs, dims=([1], [0])) # (B, ...) + h = lambdas[i + 1] - lambdas[i] + x_pred = sigmas[i + 1] / sigmas[i] * (-(tau_t ** 2) * h).exp() * x + pred_res + + if tau_t > 0 and s_noise > 0: + noise = noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * tau_t ** 2 * h).expm1().neg().sqrt() * s_noise + x_pred = x_pred + noise + return x + + +@torch.no_grad() +def sample_sa_solver_pece(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, simple_order_2=False): + """Stochastic Adams Solver with PECE (Predict–Evaluate–Correct–Evaluate) mode (NeurIPS 2023).""" + return sample_sa_solver(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, tau_func=tau_func, s_noise=s_noise, noise_sampler=noise_sampler, predictor_order=predictor_order, corrector_order=corrector_order, use_pece=True, simple_order_2=simple_order_2) diff --git a/comfy/samplers.py b/comfy/samplers.py index 25ccaf39f..e93d2a315 100644 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -720,7 +720,7 @@ KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_c "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu", "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm", "ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp", - "gradient_estimation", "gradient_estimation_cfg_pp", "er_sde", "seeds_2", "seeds_3"] + "gradient_estimation", "gradient_estimation_cfg_pp", "er_sde", "seeds_2", "seeds_3", "sa_solver", "sa_solver_pece"] class KSAMPLER(Sampler): def __init__(self, sampler_function, extra_options={}, inpaint_options={}): diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index b3a772714..33bc41842 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -2,6 +2,7 @@ import math import comfy.samplers import comfy.sample from comfy.k_diffusion import sampling as k_diffusion_sampling +from comfy.k_diffusion import sa_solver from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict import latent_preview import torch @@ -521,6 +522,49 @@ class SamplerER_SDE(ComfyNodeABC): return (sampler,) +class SamplerSASolver(ComfyNodeABC): + @classmethod + def INPUT_TYPES(cls) -> InputTypeDict: + return { + "required": { + "model": (IO.MODEL, {}), + "eta": (IO.FLOAT, {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01, "round": False},), + "sde_start_percent": (IO.FLOAT, {"default": 0.2, "min": 0.0, "max": 1.0, "step": 0.001},), + "sde_end_percent": (IO.FLOAT, {"default": 0.8, "min": 0.0, "max": 1.0, "step": 0.001},), + "s_noise": (IO.FLOAT, {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": False},), + "predictor_order": (IO.INT, {"default": 3, "min": 1, "max": 6}), + "corrector_order": (IO.INT, {"default": 4, "min": 0, "max": 6}), + "use_pece": (IO.BOOLEAN, {}), + "simple_order_2": (IO.BOOLEAN, {}), + } + } + + RETURN_TYPES = (IO.SAMPLER,) + CATEGORY = "sampling/custom_sampling/samplers" + + FUNCTION = "get_sampler" + + def get_sampler(self, model, eta, sde_start_percent, sde_end_percent, s_noise, predictor_order, corrector_order, use_pece, simple_order_2): + model_sampling = model.get_model_object("model_sampling") + start_sigma = model_sampling.percent_to_sigma(sde_start_percent) + end_sigma = model_sampling.percent_to_sigma(sde_end_percent) + tau_func = sa_solver.get_tau_interval_func(start_sigma, end_sigma, eta=eta) + + sampler_name = "sa_solver" + sampler = comfy.samplers.ksampler( + sampler_name, + { + "tau_func": tau_func, + "s_noise": s_noise, + "predictor_order": predictor_order, + "corrector_order": corrector_order, + "use_pece": use_pece, + "simple_order_2": simple_order_2, + }, + ) + return (sampler,) + + class Noise_EmptyNoise: def __init__(self): self.seed = 0 @@ -829,6 +873,7 @@ NODE_CLASS_MAPPINGS = { "SamplerDPMPP_2S_Ancestral": SamplerDPMPP_2S_Ancestral, "SamplerDPMAdaptative": SamplerDPMAdaptative, "SamplerER_SDE": SamplerER_SDE, + "SamplerSASolver": SamplerSASolver, "SplitSigmas": SplitSigmas, "SplitSigmasDenoise": SplitSigmasDenoise, "FlipSigmas": FlipSigmas, From 181a9bf26d4445e160645f6c81dc2ee29e7b6a08 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Wed, 9 Jul 2025 08:18:04 +0800 Subject: [PATCH 012/759] Support Multi Image-Caption dataset in lora training node (#8819) * initial impl of multi img/text dataset * Update nodes_train.py * Support Kohya-ss structure --- comfy_extras/nodes_train.py | 125 +++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 10 deletions(-) diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index fbff01010..17caf5ad5 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -75,7 +75,7 @@ class BiasDiff(torch.nn.Module): return self.passive_memory_usage() -def load_and_process_images(image_files, input_dir, resize_method="None"): +def load_and_process_images(image_files, input_dir, resize_method="None", w=None, h=None): """Utility function to load and process a list of images. Args: @@ -90,7 +90,6 @@ def load_and_process_images(image_files, input_dir, resize_method="None"): raise ValueError("No valid images found in input") output_images = [] - w, h = None, None for file in image_files: image_path = os.path.join(input_dir, file) @@ -206,6 +205,103 @@ class LoadImageSetFromFolderNode: return (output_tensor,) +class LoadImageTextSetFromFolderNode: + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "folder": (folder_paths.get_input_subfolders(), {"tooltip": "The folder to load images from."}), + "clip": (IO.CLIP, {"tooltip": "The CLIP model used for encoding the text."}), + }, + "optional": { + "resize_method": ( + ["None", "Stretch", "Crop", "Pad"], + {"default": "None"}, + ), + "width": ( + IO.INT, + { + "default": -1, + "min": -1, + "max": 10000, + "step": 1, + "tooltip": "The width to resize the images to. -1 means use the original width.", + }, + ), + "height": ( + IO.INT, + { + "default": -1, + "min": -1, + "max": 10000, + "step": 1, + "tooltip": "The height to resize the images to. -1 means use the original height.", + }, + ) + }, + } + + RETURN_TYPES = ("IMAGE", IO.CONDITIONING,) + FUNCTION = "load_images" + CATEGORY = "loaders" + EXPERIMENTAL = True + DESCRIPTION = "Loads a batch of images and caption from a directory for training." + + def load_images(self, folder, clip, resize_method, width=None, height=None): + if clip is None: + raise RuntimeError("ERROR: clip input is invalid: None\n\nIf the clip is from a checkpoint loader node your checkpoint does not contain a valid clip or text encoder model.") + + logging.info(f"Loading images from folder: {folder}") + + sub_input_dir = os.path.join(folder_paths.get_input_directory(), folder) + valid_extensions = [".png", ".jpg", ".jpeg", ".webp"] + + image_files = [] + for item in os.listdir(sub_input_dir): + path = os.path.join(sub_input_dir, item) + if any(item.lower().endswith(ext) for ext in valid_extensions): + image_files.append(path) + elif os.path.isdir(path): + # Support kohya-ss/sd-scripts folder structure + repeat = 1 + if item.split("_")[0].isdigit(): + repeat = int(item.split("_")[0]) + image_files.extend([ + os.path.join(path, f) for f in os.listdir(path) if any(f.lower().endswith(ext) for ext in valid_extensions) + ] * repeat) + + caption_file_path = [ + f.replace(os.path.splitext(f)[1], ".txt") + for f in image_files + ] + captions = [] + for caption_file in caption_file_path: + caption_path = os.path.join(sub_input_dir, caption_file) + if os.path.exists(caption_path): + with open(caption_path, "r", encoding="utf-8") as f: + caption = f.read().strip() + captions.append(caption) + else: + captions.append("") + + width = width if width != -1 else None + height = height if height != -1 else None + output_tensor = load_and_process_images(image_files, sub_input_dir, resize_method, width, height) + + logging.info(f"Loaded {len(output_tensor)} images from {sub_input_dir}.") + + logging.info(f"Encoding captions from {sub_input_dir}.") + conditions = [] + empty_cond = clip.encode_from_tokens_scheduled(clip.tokenize("")) + for text in captions: + if text == "": + conditions.append(empty_cond) + tokens = clip.tokenize(text) + conditions.extend(clip.encode_from_tokens_scheduled(tokens)) + logging.info(f"Encoded {len(conditions)} captions from {sub_input_dir}.") + return (output_tensor, conditions) + + def draw_loss_graph(loss_map, steps): width, height = 500, 300 img = Image.new("RGB", (width, height), "white") @@ -381,6 +477,13 @@ class TrainLoraNode: latents = latents["samples"].to(dtype) num_images = latents.shape[0] + logging.info(f"Total Images: {num_images}, Total Captions: {len(positive)}") + if len(positive) == 1 and num_images > 1: + positive = positive * num_images + elif len(positive) != num_images: + raise ValueError( + f"Number of positive conditions ({len(positive)}) does not match number of images ({num_images})." + ) with torch.inference_mode(False): lora_sd = {} @@ -474,6 +577,7 @@ class TrainLoraNode: # setup models for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model): patch(m) + mp.model.requires_grad_(False) comfy.model_management.load_models_gpu([mp], memory_required=1e20, force_full_load=True) # Setup sampler and guider like in test script @@ -486,7 +590,6 @@ class TrainLoraNode: ) guider = comfy_extras.nodes_custom_sampler.Guider_Basic(mp) guider.set_conds(positive) # Set conditioning from input - ss = comfy_extras.nodes_custom_sampler.SamplerCustomAdvanced() # yoland: this currently resize to the first image in the dataset @@ -495,21 +598,21 @@ class TrainLoraNode: try: for step in (pbar:=tqdm.trange(steps, desc="Training LoRA", smoothing=0.01, disable=not comfy.utils.PROGRESS_BAR_ENABLED)): # Generate random sigma - sigma = mp.model.model_sampling.percent_to_sigma( + sigmas = [mp.model.model_sampling.percent_to_sigma( torch.rand((1,)).item() - ) - sigma = torch.tensor([sigma]) + ) for _ in range(min(batch_size, num_images))] + sigmas = torch.tensor(sigmas) noise = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(step * 1000 + seed) indices = torch.randperm(num_images)[:batch_size] - ss.sample( - noise, guider, train_sampler, sigma, {"samples": latents[indices].clone()} - ) + batch_latent = latents[indices].clone() + guider.set_conds([positive[i] for i in indices]) # Set conditioning from input + guider.sample(noise.generate_noise({"samples": batch_latent}), batch_latent, train_sampler, sigmas, seed=noise.seed) finally: for m in mp.model.modules(): unpatch(m) - del ss, train_sampler, optimizer + del train_sampler, optimizer torch.cuda.empty_cache() for adapter in all_weight_adapters: @@ -697,6 +800,7 @@ NODE_CLASS_MAPPINGS = { "SaveLoRANode": SaveLoRA, "LoraModelLoader": LoraModelLoader, "LoadImageSetFromFolderNode": LoadImageSetFromFolderNode, + "LoadImageTextSetFromFolderNode": LoadImageTextSetFromFolderNode, "LossGraphNode": LossGraphNode, } @@ -705,5 +809,6 @@ NODE_DISPLAY_NAME_MAPPINGS = { "SaveLoRANode": "Save LoRA Weights", "LoraModelLoader": "Load LoRA Model", "LoadImageSetFromFolderNode": "Load Image Dataset from Folder", + "LoadImageTextSetFromFolderNode": "Load Image and Text Dataset from Folder", "LossGraphNode": "Plot Loss Graph", } From 5612670ee48ce500aab98e362b3372ab06d1d659 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 9 Jul 2025 00:45:48 -0700 Subject: [PATCH 013/759] Remove unmaintained notebook. (#8845) --- README.md | 4 - notebooks/comfyui_colab.ipynb | 322 ---------------------------------- 2 files changed, 326 deletions(-) delete mode 100644 notebooks/comfyui_colab.ipynb diff --git a/README.md b/README.md index ba8892b17..0e021a687 100644 --- a/README.md +++ b/README.md @@ -178,10 +178,6 @@ If you have trouble extracting it, right click the file -> properties -> unblock See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor. -## Jupyter Notebook - -To run it on services like paperspace, kaggle or colab you can use my [Jupyter Notebook](notebooks/comfyui_colab.ipynb) - ## [comfy-cli](https://docs.comfy.org/comfy-cli/getting-started) diff --git a/notebooks/comfyui_colab.ipynb b/notebooks/comfyui_colab.ipynb deleted file mode 100644 index 5560b5ff9..000000000 --- a/notebooks/comfyui_colab.ipynb +++ /dev/null @@ -1,322 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "aaaaaaaaaa" - }, - "source": [ - "Git clone the repo and install the requirements. (ignore the pip errors about protobuf)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bbbbbbbbbb" - }, - "outputs": [], - "source": [ - "#@title Environment Setup\n", - "\n", - "\n", - "OPTIONS = {}\n", - "\n", - "USE_GOOGLE_DRIVE = False #@param {type:\"boolean\"}\n", - "UPDATE_COMFY_UI = True #@param {type:\"boolean\"}\n", - "WORKSPACE = 'ComfyUI'\n", - "OPTIONS['USE_GOOGLE_DRIVE'] = USE_GOOGLE_DRIVE\n", - "OPTIONS['UPDATE_COMFY_UI'] = UPDATE_COMFY_UI\n", - "\n", - "if OPTIONS['USE_GOOGLE_DRIVE']:\n", - " !echo \"Mounting Google Drive...\"\n", - " %cd /\n", - " \n", - " from google.colab import drive\n", - " drive.mount('/content/drive')\n", - "\n", - " WORKSPACE = \"/content/drive/MyDrive/ComfyUI\"\n", - " %cd /content/drive/MyDrive\n", - "\n", - "![ ! -d $WORKSPACE ] && echo -= Initial setup ComfyUI =- && git clone https://github.com/comfyanonymous/ComfyUI\n", - "%cd $WORKSPACE\n", - "\n", - "if OPTIONS['UPDATE_COMFY_UI']:\n", - " !echo -= Updating ComfyUI =-\n", - " !git pull\n", - "\n", - "!echo -= Install dependencies =-\n", - "!pip install xformers!=0.0.18 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu121 --extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu117" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cccccccccc" - }, - "source": [ - "Download some models/checkpoints/vae or custom comfyui nodes (uncomment the commands for the ones you want)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dddddddddd" - }, - "outputs": [], - "source": [ - "# Checkpoints\n", - "\n", - "### SDXL\n", - "### I recommend these workflow examples: https://comfyanonymous.github.io/ComfyUI_examples/sdxl/\n", - "\n", - "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors -P ./models/checkpoints/\n", - "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/resolve/main/sd_xl_refiner_1.0.safetensors -P ./models/checkpoints/\n", - "\n", - "# SDXL ReVision\n", - "#!wget -c https://huggingface.co/comfyanonymous/clip_vision_g/resolve/main/clip_vision_g.safetensors -P ./models/clip_vision/\n", - "\n", - "# SD1.5\n", - "!wget -c https://huggingface.co/Comfy-Org/stable-diffusion-v1-5-archive/resolve/main/v1-5-pruned-emaonly-fp16.safetensors -P ./models/checkpoints/\n", - "\n", - "# SD2\n", - "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-2-1-base/resolve/main/v2-1_512-ema-pruned.safetensors -P ./models/checkpoints/\n", - "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-ema-pruned.safetensors -P ./models/checkpoints/\n", - "\n", - "# Some SD1.5 anime style\n", - "#!wget -c https://huggingface.co/WarriorMama777/OrangeMixs/resolve/main/Models/AbyssOrangeMix2/AbyssOrangeMix2_hard.safetensors -P ./models/checkpoints/\n", - "#!wget -c https://huggingface.co/WarriorMama777/OrangeMixs/resolve/main/Models/AbyssOrangeMix3/AOM3A1_orangemixs.safetensors -P ./models/checkpoints/\n", - "#!wget -c https://huggingface.co/WarriorMama777/OrangeMixs/resolve/main/Models/AbyssOrangeMix3/AOM3A3_orangemixs.safetensors -P ./models/checkpoints/\n", - "#!wget -c https://huggingface.co/Linaqruf/anything-v3.0/resolve/main/anything-v3-fp16-pruned.safetensors -P ./models/checkpoints/\n", - "\n", - "# Waifu Diffusion 1.5 (anime style SD2.x 768-v)\n", - "#!wget -c https://huggingface.co/waifu-diffusion/wd-1-5-beta3/resolve/main/wd-illusion-fp16.safetensors -P ./models/checkpoints/\n", - "\n", - "\n", - "# unCLIP models\n", - "#!wget -c https://huggingface.co/comfyanonymous/illuminatiDiffusionV1_v11_unCLIP/resolve/main/illuminatiDiffusionV1_v11-unclip-h-fp16.safetensors -P ./models/checkpoints/\n", - "#!wget -c https://huggingface.co/comfyanonymous/wd-1.5-beta2_unCLIP/resolve/main/wd-1-5-beta2-aesthetic-unclip-h-fp16.safetensors -P ./models/checkpoints/\n", - "\n", - "\n", - "# VAE\n", - "!wget -c https://huggingface.co/stabilityai/sd-vae-ft-mse-original/resolve/main/vae-ft-mse-840000-ema-pruned.safetensors -P ./models/vae/\n", - "#!wget -c https://huggingface.co/WarriorMama777/OrangeMixs/resolve/main/VAEs/orangemix.vae.pt -P ./models/vae/\n", - "#!wget -c https://huggingface.co/hakurei/waifu-diffusion-v1-4/resolve/main/vae/kl-f8-anime2.ckpt -P ./models/vae/\n", - "\n", - "\n", - "# Loras\n", - "#!wget -c https://civitai.com/api/download/models/10350 -O ./models/loras/theovercomer8sContrastFix_sd21768.safetensors #theovercomer8sContrastFix SD2.x 768-v\n", - "#!wget -c https://civitai.com/api/download/models/10638 -O ./models/loras/theovercomer8sContrastFix_sd15.safetensors #theovercomer8sContrastFix SD1.x\n", - "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_offset_example-lora_1.0.safetensors -P ./models/loras/ #SDXL offset noise lora\n", - "\n", - "\n", - "# T2I-Adapter\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_depth_sd14v1.pth -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_seg_sd14v1.pth -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_sketch_sd14v1.pth -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_keypose_sd14v1.pth -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_openpose_sd14v1.pth -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_color_sd14v1.pth -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_canny_sd14v1.pth -P ./models/controlnet/\n", - "\n", - "# T2I Styles Model\n", - "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_style_sd14v1.pth -P ./models/style_models/\n", - "\n", - "# CLIPVision model (needed for styles model)\n", - "#!wget -c https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/pytorch_model.bin -O ./models/clip_vision/clip_vit14.bin\n", - "\n", - "\n", - "# ControlNet\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11e_sd15_ip2p_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11e_sd15_shuffle_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_canny_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11f1p_sd15_depth_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_inpaint_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_lineart_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_mlsd_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_normalbae_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_openpose_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_scribble_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_seg_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_softedge_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15s2_lineart_anime_fp16.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11u_sd15_tile_fp16.safetensors -P ./models/controlnet/\n", - "\n", - "# ControlNet SDXL\n", - "#!wget -c https://huggingface.co/stabilityai/control-lora/resolve/main/control-LoRAs-rank256/control-lora-canny-rank256.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/stabilityai/control-lora/resolve/main/control-LoRAs-rank256/control-lora-depth-rank256.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/stabilityai/control-lora/resolve/main/control-LoRAs-rank256/control-lora-recolor-rank256.safetensors -P ./models/controlnet/\n", - "#!wget -c https://huggingface.co/stabilityai/control-lora/resolve/main/control-LoRAs-rank256/control-lora-sketch-rank256.safetensors -P ./models/controlnet/\n", - "\n", - "# Controlnet Preprocessor nodes by Fannovel16\n", - "#!cd custom_nodes && git clone https://github.com/Fannovel16/comfy_controlnet_preprocessors; cd comfy_controlnet_preprocessors && python install.py\n", - "\n", - "\n", - "# GLIGEN\n", - "#!wget -c https://huggingface.co/comfyanonymous/GLIGEN_pruned_safetensors/resolve/main/gligen_sd14_textbox_pruned_fp16.safetensors -P ./models/gligen/\n", - "\n", - "\n", - "# ESRGAN upscale model\n", - "#!wget -c https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth -P ./models/upscale_models/\n", - "#!wget -c https://huggingface.co/sberbank-ai/Real-ESRGAN/resolve/main/RealESRGAN_x2.pth -P ./models/upscale_models/\n", - "#!wget -c https://huggingface.co/sberbank-ai/Real-ESRGAN/resolve/main/RealESRGAN_x4.pth -P ./models/upscale_models/\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kkkkkkkkkkkkkkk" - }, - "source": [ - "### Run ComfyUI with cloudflared (Recommended Way)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jjjjjjjjjjjjjj" - }, - "outputs": [], - "source": [ - "!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb\n", - "!dpkg -i cloudflared-linux-amd64.deb\n", - "\n", - "import subprocess\n", - "import threading\n", - "import time\n", - "import socket\n", - "import urllib.request\n", - "\n", - "def iframe_thread(port):\n", - " while True:\n", - " time.sleep(0.5)\n", - " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - " result = sock.connect_ex(('127.0.0.1', port))\n", - " if result == 0:\n", - " break\n", - " sock.close()\n", - " print(\"\\nComfyUI finished loading, trying to launch cloudflared (if it gets stuck here cloudflared is having issues)\\n\")\n", - "\n", - " p = subprocess.Popen([\"cloudflared\", \"tunnel\", \"--url\", \"http://127.0.0.1:{}\".format(port)], stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n", - " for line in p.stderr:\n", - " l = line.decode()\n", - " if \"trycloudflare.com \" in l:\n", - " print(\"This is the URL to access ComfyUI:\", l[l.find(\"http\"):], end='')\n", - " #print(l, end='')\n", - "\n", - "\n", - "threading.Thread(target=iframe_thread, daemon=True, args=(8188,)).start()\n", - "\n", - "!python main.py --dont-print-server" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kkkkkkkkkkkkkk" - }, - "source": [ - "### Run ComfyUI with localtunnel\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jjjjjjjjjjjjj" - }, - "outputs": [], - "source": [ - "!npm install -g localtunnel\n", - "\n", - "import threading\n", - "\n", - "def iframe_thread(port):\n", - " while True:\n", - " time.sleep(0.5)\n", - " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - " result = sock.connect_ex(('127.0.0.1', port))\n", - " if result == 0:\n", - " break\n", - " sock.close()\n", - " print(\"\\nComfyUI finished loading, trying to launch localtunnel (if it gets stuck here localtunnel is having issues)\\n\")\n", - "\n", - " print(\"The password/enpoint ip for localtunnel is:\", urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip(\"\\n\"))\n", - " p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n", - " for line in p.stdout:\n", - " print(line.decode(), end='')\n", - "\n", - "\n", - "threading.Thread(target=iframe_thread, daemon=True, args=(8188,)).start()\n", - "\n", - "!python main.py --dont-print-server" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gggggggggg" - }, - "source": [ - "### Run ComfyUI with colab iframe (use only in case the previous way with localtunnel doesn't work)\n", - "\n", - "You should see the ui appear in an iframe. If you get a 403 error, it's your firefox settings or an extension that's messing things up.\n", - "\n", - "If you want to open it in another window use the link.\n", - "\n", - "Note that some UI features like live image previews won't work because the colab iframe blocks websockets." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hhhhhhhhhh" - }, - "outputs": [], - "source": [ - "import threading\n", - "def iframe_thread(port):\n", - " while True:\n", - " time.sleep(0.5)\n", - " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", - " result = sock.connect_ex(('127.0.0.1', port))\n", - " if result == 0:\n", - " break\n", - " sock.close()\n", - " from google.colab import output\n", - " output.serve_kernel_port_as_iframe(port, height=1024)\n", - " print(\"to open it in a window you can open this link here:\")\n", - " output.serve_kernel_port_as_window(port)\n", - "\n", - "threading.Thread(target=iframe_thread, daemon=True, args=(8188,)).start()\n", - "\n", - "!python main.py --dont-print-server" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "provenance": [] - }, - "gpuClass": "standard", - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} From 1205afc708d963d160f38c1d6613a384ddf6c564 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Wed, 9 Jul 2025 23:41:22 +0800 Subject: [PATCH 014/759] Better training loop implementation (#8820) --- comfy_extras/nodes_train.py | 122 +++++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 42 deletions(-) diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index 17caf5ad5..3d05fdab5 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -23,38 +23,78 @@ from comfy.comfy_types.node_typing import IO from comfy.weight_adapter import adapters +def make_batch_extra_option_dict(d, indicies, full_size=None): + new_dict = {} + for k, v in d.items(): + newv = v + if isinstance(v, dict): + newv = make_batch_extra_option_dict(v, indicies, full_size=full_size) + elif isinstance(v, torch.Tensor): + if full_size is None or v.size(0) == full_size: + newv = v[indicies] + elif isinstance(v, (list, tuple)) and len(v) == full_size: + newv = [v[i] for i in indicies] + new_dict[k] = newv + return new_dict + + class TrainSampler(comfy.samplers.Sampler): - def __init__(self, loss_fn, optimizer, loss_callback=None): + def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, total_steps=1, seed=0, training_dtype=torch.bfloat16): self.loss_fn = loss_fn self.optimizer = optimizer self.loss_callback = loss_callback + self.batch_size = batch_size + self.total_steps = total_steps + self.seed = seed + self.training_dtype = training_dtype def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False): - self.optimizer.zero_grad() - noise = model_wrap.inner_model.model_sampling.noise_scaling(sigmas, noise, latent_image, False) - latent = model_wrap.inner_model.model_sampling.noise_scaling( - torch.zeros_like(sigmas), - torch.zeros_like(noise, requires_grad=True), - latent_image, - False - ) + cond = model_wrap.conds["positive"] + dataset_size = sigmas.size(0) + torch.cuda.empty_cache() + for i in (pbar:=tqdm.trange(self.total_steps, desc="Training LoRA", smoothing=0.01, disable=not comfy.utils.PROGRESS_BAR_ENABLED)): + noisegen = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(self.seed + i * 1000) + indicies = torch.randperm(dataset_size)[:self.batch_size].tolist() - # Ensure model is in training mode and computing gradients - # x0 pred - denoised = model_wrap(noise, sigmas, **extra_args) - try: - loss = self.loss_fn(denoised, latent.clone()) - except RuntimeError as e: - if "does not require grad and does not have a grad_fn" in str(e): - logging.info("WARNING: This is likely due to the model is loaded in inference mode.") - loss.backward() - if self.loss_callback: - self.loss_callback(loss.item()) + batch_latent = torch.stack([latent_image[i] for i in indicies]) + batch_noise = noisegen.generate_noise({"samples": batch_latent}).to(batch_latent.device) + batch_sigmas = [ + model_wrap.inner_model.model_sampling.percent_to_sigma( + torch.rand((1,)).item() + ) for _ in range(min(self.batch_size, dataset_size)) + ] + batch_sigmas = torch.tensor(batch_sigmas).to(batch_latent.device) - self.optimizer.step() - # torch.cuda.memory._dump_snapshot("trainn.pickle") - # torch.cuda.memory._record_memory_history(enabled=None) + xt = model_wrap.inner_model.model_sampling.noise_scaling( + batch_sigmas, + batch_noise, + batch_latent, + False + ) + x0 = model_wrap.inner_model.model_sampling.noise_scaling( + torch.zeros_like(batch_sigmas), + torch.zeros_like(batch_noise), + batch_latent, + False + ) + + model_wrap.conds["positive"] = [ + cond[i] for i in indicies + ] + batch_extra_args = make_batch_extra_option_dict(extra_args, indicies, full_size=dataset_size) + + with torch.autocast(xt.device.type, dtype=self.training_dtype): + x0_pred = model_wrap(xt, batch_sigmas, **batch_extra_args) + loss = self.loss_fn(x0_pred, x0) + loss.backward() + if self.loss_callback: + self.loss_callback(loss.item()) + pbar.set_postfix({"loss": f"{loss.item():.4f}"}) + + self.optimizer.step() + self.optimizer.zero_grad() + torch.cuda.empty_cache() return torch.zeros_like(latent_image) @@ -584,36 +624,34 @@ class TrainLoraNode: loss_map = {"loss": []} def loss_callback(loss): loss_map["loss"].append(loss) - pbar.set_postfix({"loss": f"{loss:.4f}"}) train_sampler = TrainSampler( - criterion, optimizer, loss_callback=loss_callback + criterion, + optimizer, + loss_callback=loss_callback, + batch_size=batch_size, + total_steps=steps, + seed=seed, + training_dtype=dtype ) guider = comfy_extras.nodes_custom_sampler.Guider_Basic(mp) guider.set_conds(positive) # Set conditioning from input - # yoland: this currently resize to the first image in the dataset - # Training loop - torch.cuda.empty_cache() try: - for step in (pbar:=tqdm.trange(steps, desc="Training LoRA", smoothing=0.01, disable=not comfy.utils.PROGRESS_BAR_ENABLED)): - # Generate random sigma - sigmas = [mp.model.model_sampling.percent_to_sigma( - torch.rand((1,)).item() - ) for _ in range(min(batch_size, num_images))] - sigmas = torch.tensor(sigmas) - - noise = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(step * 1000 + seed) - - indices = torch.randperm(num_images)[:batch_size] - batch_latent = latents[indices].clone() - guider.set_conds([positive[i] for i in indices]) # Set conditioning from input - guider.sample(noise.generate_noise({"samples": batch_latent}), batch_latent, train_sampler, sigmas, seed=noise.seed) + # Generate dummy sigmas and noise + sigmas = torch.tensor(range(num_images)) + noise = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(seed) + guider.sample( + noise.generate_noise({"samples": latents}), + latents, + train_sampler, + sigmas, + seed=noise.seed + ) finally: for m in mp.model.modules(): unpatch(m) del train_sampler, optimizer - torch.cuda.empty_cache() for adapter in all_weight_adapters: adapter.requires_grad_(False) From 1fd306824d35bf2669f6be46fadab37efd7081c4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 9 Jul 2025 22:03:27 -0700 Subject: [PATCH 015/759] Add warning to catch torch import mistakes. (#8852) --- main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/main.py b/main.py index d488c0f4c..0b1987ef4 100644 --- a/main.py +++ b/main.py @@ -127,6 +127,9 @@ if __name__ == "__main__": import cuda_malloc +if 'torch' in sys.modules: + logging.warning("WARNING: Potential Error in code: Torch already imported, torch should never be imported before this point.") + import comfy.utils import execution From 2b653e8c18f18792e7f080df611c8a35f1d0fdf4 Mon Sep 17 00:00:00 2001 From: guill Date: Thu, 10 Jul 2025 11:46:19 -0700 Subject: [PATCH 016/759] Support for async node functions (#8830) * Support for async execution functions This commit adds support for node execution functions defined as async. When a node's execution function is defined as async, we can continue executing other nodes while it is processing. Standard uses of `await` should "just work", but people will still have to be careful if they spawn actual threads. Because torch doesn't really have async/await versions of functions, this won't particularly help with most locally-executing nodes, but it does work for e.g. web requests to other machines. In addition to the execute function, the `VALIDATE_INPUTS` and `check_lazy_status` functions can also be defined as async, though we'll only resolve one node at a time right now for those. * Add the execution model tests to CI * Add a missing file It looks like this got caught by .gitignore? There's probably a better place to put it, but I'm not sure what that is. * Add the websocket library for automated tests * Add additional tests for async error cases Also fixes one bug that was found when an async function throws an error after being scheduled on a task. * Add a feature flags message to reduce bandwidth We now only send 1 preview message of the latest type the client can support. We'll add a console warning when the client fails to send a feature flags message at some point in the future. * Add async tests to CI * Don't actually add new tests in this PR Will do it in a separate PR * Resolve unit test in GPU-less runner * Just remove the tests that GHA can't handle * Change line endings to UNIX-style * Avoid loading model_management.py so early Because model_management.py has a top-level `logging.info`, we have to be careful not to import that file before we call `setup_logging`. If we do, we end up having the default logging handler registered in addition to our custom one. --- comfy/utils.py | 5 +- comfy_api/feature_flags.py | 69 +++ comfy_execution/caching.py | 53 +-- comfy_execution/graph.py | 20 +- comfy_execution/progress.py | 347 +++++++++++++++ comfy_execution/utils.py | 46 ++ execution.py | 135 ++++-- main.py | 32 +- protocol.py | 7 + server.py | 98 ++++- tests-unit/feature_flags_test.py | 98 +++++ tests-unit/requirements.txt | 1 + tests-unit/websocket_feature_flags_test.py | 77 ++++ tests/inference/extra_model_paths.yaml | 2 +- tests/inference/test_async_nodes.py | 410 ++++++++++++++++++ tests/inference/test_execution.py | 65 ++- .../testing_nodes/testing-pack/__init__.py | 49 ++- .../testing-pack/async_test_nodes.py | 343 +++++++++++++++ .../testing-pack/specific_tests.py | 136 ++++++ 19 files changed, 1898 insertions(+), 95 deletions(-) create mode 100644 comfy_api/feature_flags.py create mode 100644 comfy_execution/progress.py create mode 100644 comfy_execution/utils.py create mode 100644 protocol.py create mode 100644 tests-unit/feature_flags_test.py create mode 100644 tests-unit/websocket_feature_flags_test.py create mode 100644 tests/inference/test_async_nodes.py create mode 100644 tests/inference/testing_nodes/testing-pack/async_test_nodes.py diff --git a/comfy/utils.py b/comfy/utils.py index 47981d8f6..f8e01f713 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -998,11 +998,12 @@ def set_progress_bar_global_hook(function): PROGRESS_BAR_HOOK = function class ProgressBar: - def __init__(self, total): + def __init__(self, total, node_id=None): global PROGRESS_BAR_HOOK self.total = total self.current = 0 self.hook = PROGRESS_BAR_HOOK + self.node_id = node_id def update_absolute(self, value, total=None, preview=None): if total is not None: @@ -1011,7 +1012,7 @@ class ProgressBar: value = self.total self.current = value if self.hook is not None: - self.hook(self.current, self.total, preview) + self.hook(self.current, self.total, preview, node_id=self.node_id) def update(self, value): self.update_absolute(self.current + value) diff --git a/comfy_api/feature_flags.py b/comfy_api/feature_flags.py new file mode 100644 index 000000000..0d4389a6e --- /dev/null +++ b/comfy_api/feature_flags.py @@ -0,0 +1,69 @@ +""" +Feature flags module for ComfyUI WebSocket protocol negotiation. + +This module handles capability negotiation between frontend and backend, +allowing graceful protocol evolution while maintaining backward compatibility. +""" + +from typing import Any, Dict + +from comfy.cli_args import args + +# Default server capabilities +SERVER_FEATURE_FLAGS: Dict[str, Any] = { + "supports_preview_metadata": True, + "max_upload_size": args.max_upload_size * 1024 * 1024, # Convert MB to bytes +} + + +def get_connection_feature( + sockets_metadata: Dict[str, Dict[str, Any]], + sid: str, + feature_name: str, + default: Any = False +) -> Any: + """ + Get a feature flag value for a specific connection. + + Args: + sockets_metadata: Dictionary of socket metadata + sid: Session ID of the connection + feature_name: Name of the feature to check + default: Default value if feature not found + + Returns: + Feature value or default if not found + """ + if sid not in sockets_metadata: + return default + + return sockets_metadata[sid].get("feature_flags", {}).get(feature_name, default) + + +def supports_feature( + sockets_metadata: Dict[str, Dict[str, Any]], + sid: str, + feature_name: str +) -> bool: + """ + Check if a connection supports a specific feature. + + Args: + sockets_metadata: Dictionary of socket metadata + sid: Session ID of the connection + feature_name: Name of the feature to check + + Returns: + Boolean indicating if feature is supported + """ + return get_connection_feature(sockets_metadata, sid, feature_name, False) is True + + +def get_server_features() -> Dict[str, Any]: + """ + Get the server's feature flags. + + Returns: + Dictionary of server feature flags + """ + return SERVER_FEATURE_FLAGS.copy() diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py index dbb37b89f..41224ce3b 100644 --- a/comfy_execution/caching.py +++ b/comfy_execution/caching.py @@ -1,6 +1,7 @@ import itertools from typing import Sequence, Mapping, Dict from comfy_execution.graph import DynamicPrompt +from abc import ABC, abstractmethod import nodes @@ -16,12 +17,13 @@ def include_unique_id_in_input(class_type: str) -> bool: NODE_CLASS_CONTAINS_UNIQUE_ID[class_type] = "UNIQUE_ID" in class_def.INPUT_TYPES().get("hidden", {}).values() return NODE_CLASS_CONTAINS_UNIQUE_ID[class_type] -class CacheKeySet: +class CacheKeySet(ABC): def __init__(self, dynprompt, node_ids, is_changed_cache): self.keys = {} self.subcache_keys = {} - def add_keys(self, node_ids): + @abstractmethod + async def add_keys(self, node_ids): raise NotImplementedError() def all_node_ids(self): @@ -60,9 +62,8 @@ class CacheKeySetID(CacheKeySet): def __init__(self, dynprompt, node_ids, is_changed_cache): super().__init__(dynprompt, node_ids, is_changed_cache) self.dynprompt = dynprompt - self.add_keys(node_ids) - def add_keys(self, node_ids): + async def add_keys(self, node_ids): for node_id in node_ids: if node_id in self.keys: continue @@ -77,37 +78,36 @@ class CacheKeySetInputSignature(CacheKeySet): super().__init__(dynprompt, node_ids, is_changed_cache) self.dynprompt = dynprompt self.is_changed_cache = is_changed_cache - self.add_keys(node_ids) def include_node_id_in_input(self) -> bool: return False - def add_keys(self, node_ids): + async def add_keys(self, node_ids): for node_id in node_ids: if node_id in self.keys: continue if not self.dynprompt.has_node(node_id): continue node = self.dynprompt.get_node(node_id) - self.keys[node_id] = self.get_node_signature(self.dynprompt, node_id) + self.keys[node_id] = await self.get_node_signature(self.dynprompt, node_id) self.subcache_keys[node_id] = (node_id, node["class_type"]) - def get_node_signature(self, dynprompt, node_id): + async def get_node_signature(self, dynprompt, node_id): signature = [] ancestors, order_mapping = self.get_ordered_ancestry(dynprompt, node_id) - signature.append(self.get_immediate_node_signature(dynprompt, node_id, order_mapping)) + signature.append(await self.get_immediate_node_signature(dynprompt, node_id, order_mapping)) for ancestor_id in ancestors: - signature.append(self.get_immediate_node_signature(dynprompt, ancestor_id, order_mapping)) + signature.append(await self.get_immediate_node_signature(dynprompt, ancestor_id, order_mapping)) return to_hashable(signature) - def get_immediate_node_signature(self, dynprompt, node_id, ancestor_order_mapping): + async def get_immediate_node_signature(self, dynprompt, node_id, ancestor_order_mapping): if not dynprompt.has_node(node_id): # This node doesn't exist -- we can't cache it. return [float("NaN")] node = dynprompt.get_node(node_id) class_type = node["class_type"] class_def = nodes.NODE_CLASS_MAPPINGS[class_type] - signature = [class_type, self.is_changed_cache.get(node_id)] + signature = [class_type, await self.is_changed_cache.get(node_id)] if self.include_node_id_in_input() or (hasattr(class_def, "NOT_IDEMPOTENT") and class_def.NOT_IDEMPOTENT) or include_unique_id_in_input(class_type): signature.append(node_id) inputs = node["inputs"] @@ -150,9 +150,10 @@ class BasicCache: self.cache = {} self.subcaches = {} - def set_prompt(self, dynprompt, node_ids, is_changed_cache): + async def set_prompt(self, dynprompt, node_ids, is_changed_cache): self.dynprompt = dynprompt self.cache_key_set = self.key_class(dynprompt, node_ids, is_changed_cache) + await self.cache_key_set.add_keys(node_ids) self.is_changed_cache = is_changed_cache self.initialized = True @@ -201,13 +202,13 @@ class BasicCache: else: return None - def _ensure_subcache(self, node_id, children_ids): + async def _ensure_subcache(self, node_id, children_ids): subcache_key = self.cache_key_set.get_subcache_key(node_id) subcache = self.subcaches.get(subcache_key, None) if subcache is None: subcache = BasicCache(self.key_class) self.subcaches[subcache_key] = subcache - subcache.set_prompt(self.dynprompt, children_ids, self.is_changed_cache) + await subcache.set_prompt(self.dynprompt, children_ids, self.is_changed_cache) return subcache def _get_subcache(self, node_id): @@ -259,10 +260,10 @@ class HierarchicalCache(BasicCache): assert cache is not None cache._set_immediate(node_id, value) - def ensure_subcache_for(self, node_id, children_ids): + async def ensure_subcache_for(self, node_id, children_ids): cache = self._get_cache_for(node_id) assert cache is not None - return cache._ensure_subcache(node_id, children_ids) + return await cache._ensure_subcache(node_id, children_ids) class LRUCache(BasicCache): def __init__(self, key_class, max_size=100): @@ -273,8 +274,8 @@ class LRUCache(BasicCache): self.used_generation = {} self.children = {} - def set_prompt(self, dynprompt, node_ids, is_changed_cache): - super().set_prompt(dynprompt, node_ids, is_changed_cache) + async def set_prompt(self, dynprompt, node_ids, is_changed_cache): + await super().set_prompt(dynprompt, node_ids, is_changed_cache) self.generation += 1 for node_id in node_ids: self._mark_used(node_id) @@ -303,11 +304,11 @@ class LRUCache(BasicCache): self._mark_used(node_id) return self._set_immediate(node_id, value) - def ensure_subcache_for(self, node_id, children_ids): + async def ensure_subcache_for(self, node_id, children_ids): # Just uses subcaches for tracking 'live' nodes - super()._ensure_subcache(node_id, children_ids) + await super()._ensure_subcache(node_id, children_ids) - self.cache_key_set.add_keys(children_ids) + await self.cache_key_set.add_keys(children_ids) self._mark_used(node_id) cache_key = self.cache_key_set.get_data_key(node_id) self.children[cache_key] = [] @@ -337,7 +338,7 @@ class DependencyAwareCache(BasicCache): self.ancestors = {} # Maps node_id -> set of ancestor node_ids self.executed_nodes = set() # Tracks nodes that have been executed - def set_prompt(self, dynprompt, node_ids, is_changed_cache): + async def set_prompt(self, dynprompt, node_ids, is_changed_cache): """ Clear the entire cache and rebuild the dependency graph. @@ -354,7 +355,7 @@ class DependencyAwareCache(BasicCache): self.executed_nodes.clear() # Call the parent method to initialize the cache with the new prompt - super().set_prompt(dynprompt, node_ids, is_changed_cache) + await super().set_prompt(dynprompt, node_ids, is_changed_cache) # Rebuild the dependency graph self._build_dependency_graph(dynprompt, node_ids) @@ -405,7 +406,7 @@ class DependencyAwareCache(BasicCache): """ return self._get_immediate(node_id) - def ensure_subcache_for(self, node_id, children_ids): + async def ensure_subcache_for(self, node_id, children_ids): """ Ensure a subcache exists for a node and update dependencies. @@ -416,7 +417,7 @@ class DependencyAwareCache(BasicCache): Returns: The subcache object for the node. """ - subcache = super()._ensure_subcache(node_id, children_ids) + subcache = await super()._ensure_subcache(node_id, children_ids) for child_id in children_ids: self.descendants[node_id].add(child_id) self.ancestors[child_id].add(node_id) diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index a2799b52e..c79243e1e 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -2,6 +2,7 @@ from __future__ import annotations from typing import Type, Literal import nodes +import asyncio from comfy_execution.graph_utils import is_link from comfy.comfy_types.node_typing import ComfyNodeABC, InputTypeDict, InputTypeOptions @@ -100,6 +101,8 @@ class TopologicalSort: self.pendingNodes = {} self.blockCount = {} # Number of nodes this node is directly blocked by self.blocking = {} # Which nodes are blocked by this node + self.externalBlocks = 0 + self.unblockedEvent = asyncio.Event() def get_input_info(self, unique_id, input_name): class_type = self.dynprompt.get_node(unique_id)["class_type"] @@ -153,6 +156,16 @@ class TopologicalSort: for link in links: self.add_strong_link(*link) + def add_external_block(self, node_id): + assert node_id in self.blockCount, "Can't add external block to a node that isn't pending" + self.externalBlocks += 1 + self.blockCount[node_id] += 1 + def unblock(): + self.externalBlocks -= 1 + self.blockCount[node_id] -= 1 + self.unblockedEvent.set() + return unblock + def is_cached(self, node_id): return False @@ -181,11 +194,16 @@ class ExecutionList(TopologicalSort): def is_cached(self, node_id): return self.output_cache.get(node_id) is not None - def stage_node_execution(self): + async def stage_node_execution(self): assert self.staged_node_id is None if self.is_empty(): return None, None, None available = self.get_ready_nodes() + while len(available) == 0 and self.externalBlocks > 0: + # Wait for an external block to be released + await self.unblockedEvent.wait() + self.unblockedEvent.clear() + available = self.get_ready_nodes() if len(available) == 0: cycled_nodes = self.get_nodes_in_cycle() # Because cycles composed entirely of static nodes are caught during initial validation, diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py new file mode 100644 index 000000000..5645b3e3c --- /dev/null +++ b/comfy_execution/progress.py @@ -0,0 +1,347 @@ +from typing import TypedDict, Dict, Optional +from typing_extensions import override +from PIL import Image +from enum import Enum +from abc import ABC +from tqdm import tqdm +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from comfy_execution.graph import DynamicPrompt +from protocol import BinaryEventTypes +from comfy_api import feature_flags + + +class NodeState(Enum): + Pending = "pending" + Running = "running" + Finished = "finished" + Error = "error" + + +class NodeProgressState(TypedDict): + """ + A class to represent the state of a node's progress. + """ + + state: NodeState + value: float + max: float + + +class ProgressHandler(ABC): + """ + Abstract base class for progress handlers. + Progress handlers receive progress updates and display them in various ways. + """ + + def __init__(self, name: str): + self.name = name + self.enabled = True + + def set_registry(self, registry: "ProgressRegistry"): + pass + + def start_handler(self, node_id: str, state: NodeProgressState, prompt_id: str): + """Called when a node starts processing""" + pass + + def update_handler( + self, + node_id: str, + value: float, + max_value: float, + state: NodeProgressState, + prompt_id: str, + image: Optional[Image.Image] = None, + ): + """Called when a node's progress is updated""" + pass + + def finish_handler(self, node_id: str, state: NodeProgressState, prompt_id: str): + """Called when a node finishes processing""" + pass + + def reset(self): + """Called when the progress registry is reset""" + pass + + def enable(self): + """Enable this handler""" + self.enabled = True + + def disable(self): + """Disable this handler""" + self.enabled = False + + +class CLIProgressHandler(ProgressHandler): + """ + Handler that displays progress using tqdm progress bars in the CLI. + """ + + def __init__(self): + super().__init__("cli") + self.progress_bars: Dict[str, tqdm] = {} + + @override + def start_handler(self, node_id: str, state: NodeProgressState, prompt_id: str): + # Create a new tqdm progress bar + if node_id not in self.progress_bars: + self.progress_bars[node_id] = tqdm( + total=state["max"], + desc=f"Node {node_id}", + unit="steps", + leave=True, + position=len(self.progress_bars), + ) + + @override + def update_handler( + self, + node_id: str, + value: float, + max_value: float, + state: NodeProgressState, + prompt_id: str, + image: Optional[Image.Image] = None, + ): + # Handle case where start_handler wasn't called + if node_id not in self.progress_bars: + self.progress_bars[node_id] = tqdm( + total=max_value, + desc=f"Node {node_id}", + unit="steps", + leave=True, + position=len(self.progress_bars), + ) + self.progress_bars[node_id].update(value) + else: + # Update existing progress bar + if max_value != self.progress_bars[node_id].total: + self.progress_bars[node_id].total = max_value + # Calculate the update amount (difference from current position) + current_position = self.progress_bars[node_id].n + update_amount = value - current_position + if update_amount > 0: + self.progress_bars[node_id].update(update_amount) + + @override + def finish_handler(self, node_id: str, state: NodeProgressState, prompt_id: str): + # Complete and close the progress bar if it exists + if node_id in self.progress_bars: + # Ensure the bar shows 100% completion + remaining = state["max"] - self.progress_bars[node_id].n + if remaining > 0: + self.progress_bars[node_id].update(remaining) + self.progress_bars[node_id].close() + del self.progress_bars[node_id] + + @override + def reset(self): + # Close all progress bars + for bar in self.progress_bars.values(): + bar.close() + self.progress_bars.clear() + + +class WebUIProgressHandler(ProgressHandler): + """ + Handler that sends progress updates to the WebUI via WebSockets. + """ + + def __init__(self, server_instance): + super().__init__("webui") + self.server_instance = server_instance + + def set_registry(self, registry: "ProgressRegistry"): + self.registry = registry + + def _send_progress_state(self, prompt_id: str, nodes: Dict[str, NodeProgressState]): + """Send the current progress state to the client""" + if self.server_instance is None: + return + + # Only send info for non-pending nodes + active_nodes = { + node_id: { + "value": state["value"], + "max": state["max"], + "state": state["state"].value, + "node_id": node_id, + "prompt_id": prompt_id, + "display_node_id": self.registry.dynprompt.get_display_node_id(node_id), + "parent_node_id": self.registry.dynprompt.get_parent_node_id(node_id), + "real_node_id": self.registry.dynprompt.get_real_node_id(node_id), + } + for node_id, state in nodes.items() + if state["state"] != NodeState.Pending + } + + # Send a combined progress_state message with all node states + self.server_instance.send_sync( + "progress_state", {"prompt_id": prompt_id, "nodes": active_nodes} + ) + + @override + def start_handler(self, node_id: str, state: NodeProgressState, prompt_id: str): + # Send progress state of all nodes + if self.registry: + self._send_progress_state(prompt_id, self.registry.nodes) + + @override + def update_handler( + self, + node_id: str, + value: float, + max_value: float, + state: NodeProgressState, + prompt_id: str, + image: Optional[Image.Image] = None, + ): + # Send progress state of all nodes + if self.registry: + self._send_progress_state(prompt_id, self.registry.nodes) + if image: + # Only send new format if client supports it + if feature_flags.supports_feature( + self.server_instance.sockets_metadata, + self.server_instance.client_id, + "supports_preview_metadata", + ): + metadata = { + "node_id": node_id, + "prompt_id": prompt_id, + "display_node_id": self.registry.dynprompt.get_display_node_id( + node_id + ), + "parent_node_id": self.registry.dynprompt.get_parent_node_id( + node_id + ), + "real_node_id": self.registry.dynprompt.get_real_node_id(node_id), + } + self.server_instance.send_sync( + BinaryEventTypes.PREVIEW_IMAGE_WITH_METADATA, + (image, metadata), + self.server_instance.client_id, + ) + + @override + def finish_handler(self, node_id: str, state: NodeProgressState, prompt_id: str): + # Send progress state of all nodes + if self.registry: + self._send_progress_state(prompt_id, self.registry.nodes) + + +class ProgressRegistry: + """ + Registry that maintains node progress state and notifies registered handlers. + """ + + def __init__(self, prompt_id: str, dynprompt: "DynamicPrompt"): + self.prompt_id = prompt_id + self.dynprompt = dynprompt + self.nodes: Dict[str, NodeProgressState] = {} + self.handlers: Dict[str, ProgressHandler] = {} + + def register_handler(self, handler: ProgressHandler) -> None: + """Register a progress handler""" + self.handlers[handler.name] = handler + + def unregister_handler(self, handler_name: str) -> None: + """Unregister a progress handler""" + if handler_name in self.handlers: + # Allow handler to clean up resources + self.handlers[handler_name].reset() + del self.handlers[handler_name] + + def enable_handler(self, handler_name: str) -> None: + """Enable a progress handler""" + if handler_name in self.handlers: + self.handlers[handler_name].enable() + + def disable_handler(self, handler_name: str) -> None: + """Disable a progress handler""" + if handler_name in self.handlers: + self.handlers[handler_name].disable() + + def ensure_entry(self, node_id: str) -> NodeProgressState: + """Ensure a node entry exists""" + if node_id not in self.nodes: + self.nodes[node_id] = NodeProgressState( + state=NodeState.Pending, value=0, max=1 + ) + return self.nodes[node_id] + + def start_progress(self, node_id: str) -> None: + """Start progress tracking for a node""" + entry = self.ensure_entry(node_id) + entry["state"] = NodeState.Running + entry["value"] = 0.0 + entry["max"] = 1.0 + + # Notify all enabled handlers + for handler in self.handlers.values(): + if handler.enabled: + handler.start_handler(node_id, entry, self.prompt_id) + + def update_progress( + self, node_id: str, value: float, max_value: float, image: Optional[Image.Image] + ) -> None: + """Update progress for a node""" + entry = self.ensure_entry(node_id) + entry["state"] = NodeState.Running + entry["value"] = value + entry["max"] = max_value + + # Notify all enabled handlers + for handler in self.handlers.values(): + if handler.enabled: + handler.update_handler( + node_id, value, max_value, entry, self.prompt_id, image + ) + + def finish_progress(self, node_id: str) -> None: + """Finish progress tracking for a node""" + entry = self.ensure_entry(node_id) + entry["state"] = NodeState.Finished + entry["value"] = entry["max"] + + # Notify all enabled handlers + for handler in self.handlers.values(): + if handler.enabled: + handler.finish_handler(node_id, entry, self.prompt_id) + + def reset_handlers(self) -> None: + """Reset all handlers""" + for handler in self.handlers.values(): + handler.reset() + +# Global registry instance +global_progress_registry: ProgressRegistry | None = None + +def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt") -> None: + global global_progress_registry + + # Reset existing handlers if registry exists + if global_progress_registry is not None: + global_progress_registry.reset_handlers() + + # Create new registry + global_progress_registry = ProgressRegistry(prompt_id, dynprompt) + + +def add_progress_handler(handler: ProgressHandler) -> None: + registry = get_progress_state() + handler.set_registry(registry) + registry.register_handler(handler) + + +def get_progress_state() -> ProgressRegistry: + global global_progress_registry + if global_progress_registry is None: + from comfy_execution.graph import DynamicPrompt + + global_progress_registry = ProgressRegistry( + prompt_id="", dynprompt=DynamicPrompt({}) + ) + return global_progress_registry diff --git a/comfy_execution/utils.py b/comfy_execution/utils.py new file mode 100644 index 000000000..62d32f101 --- /dev/null +++ b/comfy_execution/utils.py @@ -0,0 +1,46 @@ +import contextvars +from typing import Optional, NamedTuple + +class ExecutionContext(NamedTuple): + """ + Context information about the currently executing node. + + Attributes: + node_id: The ID of the currently executing node + list_index: The index in a list being processed (for operations on batches/lists) + """ + prompt_id: str + node_id: str + list_index: Optional[int] + +current_executing_context: contextvars.ContextVar[Optional[ExecutionContext]] = contextvars.ContextVar("current_executing_context", default=None) + +def get_executing_context() -> Optional[ExecutionContext]: + return current_executing_context.get(None) + +class CurrentNodeContext: + """ + Context manager for setting the current executing node context. + + Sets the current_executing_context on enter and resets it on exit. + + Example: + with CurrentNodeContext(node_id="123", list_index=0): + # Code that should run with the current node context set + process_image() + """ + def __init__(self, prompt_id: str, node_id: str, list_index: Optional[int] = None): + self.context = ExecutionContext( + prompt_id= prompt_id, + node_id= node_id, + list_index= list_index + ) + self.token = None + + def __enter__(self): + self.token = current_executing_context.set(self.context) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.token is not None: + current_executing_context.reset(self.token) diff --git a/execution.py b/execution.py index f6006fa12..90cefc023 100644 --- a/execution.py +++ b/execution.py @@ -8,12 +8,14 @@ import time import traceback from enum import Enum from typing import List, Literal, NamedTuple, Optional +import asyncio import torch import comfy.model_management import nodes from comfy_execution.caching import ( + BasicCache, CacheKeySetID, CacheKeySetInputSignature, DependencyAwareCache, @@ -28,6 +30,8 @@ from comfy_execution.graph import ( ) from comfy_execution.graph_utils import GraphBuilder, is_link from comfy_execution.validation import validate_node_input +from comfy_execution.progress import get_progress_state, reset_progress_state, add_progress_handler, WebUIProgressHandler +from comfy_execution.utils import CurrentNodeContext class ExecutionResult(Enum): @@ -39,12 +43,13 @@ class DuplicateNodeError(Exception): pass class IsChangedCache: - def __init__(self, dynprompt, outputs_cache): + def __init__(self, prompt_id: str, dynprompt: DynamicPrompt, outputs_cache: BasicCache): + self.prompt_id = prompt_id self.dynprompt = dynprompt self.outputs_cache = outputs_cache self.is_changed = {} - def get(self, node_id): + async def get(self, node_id): if node_id in self.is_changed: return self.is_changed[node_id] @@ -62,7 +67,8 @@ class IsChangedCache: # Intentionally do not use cached outputs here. We only want constants in IS_CHANGED input_data_all, _ = get_input_data(node["inputs"], class_def, node_id, None) try: - is_changed = _map_node_over_list(class_def, input_data_all, "IS_CHANGED") + is_changed = await _async_map_node_over_list(self.prompt_id, node_id, class_def, input_data_all, "IS_CHANGED") + is_changed = await resolve_map_node_over_list_results(is_changed) node["is_changed"] = [None if isinstance(x, ExecutionBlocker) else x for x in is_changed] except Exception as e: logging.warning("WARNING: {}".format(e)) @@ -164,7 +170,19 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e map_node_over_list = None #Don't hook this please -def _map_node_over_list(obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None): +async def resolve_map_node_over_list_results(results): + remaining = [x for x in results if isinstance(x, asyncio.Task) and not x.done()] + if len(remaining) == 0: + return [x.result() if isinstance(x, asyncio.Task) else x for x in results] + else: + done, pending = await asyncio.wait(remaining) + for task in done: + exc = task.exception() + if exc is not None: + raise exc + return [x.result() if isinstance(x, asyncio.Task) else x for x in results] + +async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None): # check if node wants the lists input_is_list = getattr(obj, "INPUT_IS_LIST", False) @@ -178,7 +196,7 @@ def _map_node_over_list(obj, input_data_all, func, allow_interrupt=False, execut return {k: v[i if len(v) > i else -1] for k, v in d.items()} results = [] - def process_inputs(inputs, index=None, input_is_list=False): + async def process_inputs(inputs, index=None, input_is_list=False): if allow_interrupt: nodes.before_node_execution() execution_block = None @@ -194,20 +212,37 @@ def _map_node_over_list(obj, input_data_all, func, allow_interrupt=False, execut if execution_block is None: if pre_execute_cb is not None and index is not None: pre_execute_cb(index) - results.append(getattr(obj, func)(**inputs)) + f = getattr(obj, func) + if inspect.iscoroutinefunction(f): + async def async_wrapper(f, prompt_id, unique_id, list_index, args): + with CurrentNodeContext(prompt_id, unique_id, list_index): + return await f(**args) + task = asyncio.create_task(async_wrapper(f, prompt_id, unique_id, index, args=inputs)) + # Give the task a chance to execute without yielding + await asyncio.sleep(0) + if task.done(): + result = task.result() + results.append(result) + else: + results.append(task) + else: + with CurrentNodeContext(prompt_id, unique_id, index): + result = f(**inputs) + results.append(result) else: results.append(execution_block) if input_is_list: - process_inputs(input_data_all, 0, input_is_list=input_is_list) + await process_inputs(input_data_all, 0, input_is_list=input_is_list) elif max_len_input == 0: - process_inputs({}) + await process_inputs({}) else: for i in range(max_len_input): input_dict = slice_dict(input_data_all, i) - process_inputs(input_dict, i) + await process_inputs(input_dict, i) return results + def merge_result_data(results, obj): # check which outputs need concatenating output = [] @@ -229,11 +264,18 @@ def merge_result_data(results, obj): output.append([o[i] for o in results]) return output -def get_output_data(obj, input_data_all, execution_block_cb=None, pre_execute_cb=None): +async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, pre_execute_cb=None): + return_values = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb) + has_pending_task = any(isinstance(r, asyncio.Task) and not r.done() for r in return_values) + if has_pending_task: + return return_values, {}, False, has_pending_task + output, ui, has_subgraph = get_output_from_returns(return_values, obj) + return output, ui, has_subgraph, False + +def get_output_from_returns(return_values, obj): results = [] uis = [] subgraph_results = [] - return_values = _map_node_over_list(obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb) has_subgraph = False for i in range(len(return_values)): r = return_values[i] @@ -267,6 +309,10 @@ def get_output_data(obj, input_data_all, execution_block_cb=None, pre_execute_cb else: output = [] ui = dict() + # TODO: Think there's an existing bug here + # If we're performing a subgraph expansion, we probably shouldn't be returning UI values yet. + # They'll get cached without the completed subgraphs. It's an edge case and I'm not aware of + # any nodes that use both subgraph expansion and custom UI outputs, but might be a problem in the future. if len(uis) > 0: ui = {k: [y for x in uis for y in x[k]] for k in uis[0].keys()} return output, ui, has_subgraph @@ -279,7 +325,7 @@ def format_value(x): else: return str(x) -def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results): +async def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes): unique_id = current_item real_node_id = dynprompt.get_real_node_id(unique_id) display_node_id = dynprompt.get_display_node_id(unique_id) @@ -291,11 +337,26 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp if server.client_id is not None: cached_output = caches.ui.get(unique_id) or {} server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_output.get("output",None), "prompt_id": prompt_id }, server.client_id) + get_progress_state().finish_progress(unique_id) return (ExecutionResult.SUCCESS, None, None) input_data_all = None try: - if unique_id in pending_subgraph_results: + if unique_id in pending_async_nodes: + results = [] + for r in pending_async_nodes[unique_id]: + if isinstance(r, asyncio.Task): + try: + results.append(r.result()) + except Exception as ex: + # An async task failed - propagate the exception up + del pending_async_nodes[unique_id] + raise ex + else: + results.append(r) + del pending_async_nodes[unique_id] + output_data, output_ui, has_subgraph = get_output_from_returns(results, class_def) + elif unique_id in pending_subgraph_results: cached_results = pending_subgraph_results[unique_id] resolved_outputs = [] for is_subgraph, result in cached_results: @@ -317,6 +378,7 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp output_ui = [] has_subgraph = False else: + get_progress_state().start_progress(unique_id) input_data_all, missing_keys = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data) if server.client_id is not None: server.last_node_id = display_node_id @@ -328,7 +390,8 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp caches.objects.set(unique_id, obj) if hasattr(obj, "check_lazy_status"): - required_inputs = _map_node_over_list(obj, input_data_all, "check_lazy_status", allow_interrupt=True) + required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True) + required_inputs = await resolve_map_node_over_list_results(required_inputs) required_inputs = set(sum([r for r in required_inputs if isinstance(r,list)], [])) required_inputs = [x for x in required_inputs if isinstance(x,str) and ( x not in input_data_all or x in missing_keys @@ -357,8 +420,18 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp else: return block def pre_execute_cb(call_index): + # TODO - How to handle this with async functions without contextvars (which requires Python 3.12)? GraphBuilder.set_default_prefix(unique_id, call_index, 0) - output_data, output_ui, has_subgraph = get_output_data(obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb) + output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb) + if has_pending_tasks: + pending_async_nodes[unique_id] = output_data + unblock = execution_list.add_external_block(unique_id) + async def await_completion(): + tasks = [x for x in output_data if isinstance(x, asyncio.Task)] + await asyncio.gather(*tasks, return_exceptions=True) + unblock() + asyncio.create_task(await_completion()) + return (ExecutionResult.PENDING, None, None) if len(output_ui) > 0: caches.ui.set(unique_id, { "meta": { @@ -401,7 +474,8 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp cached_outputs.append((True, node_outputs)) new_node_ids = set(new_node_ids) for cache in caches.all: - cache.ensure_subcache_for(unique_id, new_node_ids).clean_unused() + subcache = await cache.ensure_subcache_for(unique_id, new_node_ids) + subcache.clean_unused() for node_id in new_output_ids: execution_list.add_node(node_id) for link in new_output_links: @@ -446,6 +520,7 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp return (ExecutionResult.FAILURE, error_details, ex) + get_progress_state().finish_progress(unique_id) executed.add(unique_id) return (ExecutionResult.SUCCESS, None, None) @@ -500,6 +575,11 @@ class PromptExecutor: self.add_message("execution_error", mes, broadcast=False) def execute(self, prompt, prompt_id, extra_data={}, execute_outputs=[]): + asyncio_loop = asyncio.new_event_loop() + asyncio.set_event_loop(asyncio_loop) + asyncio.run(self.execute_async(prompt, prompt_id, extra_data, execute_outputs)) + + async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs=[]): nodes.interrupt_processing(False) if "client_id" in extra_data: @@ -512,9 +592,11 @@ class PromptExecutor: with torch.inference_mode(): dynamic_prompt = DynamicPrompt(prompt) - is_changed_cache = IsChangedCache(dynamic_prompt, self.caches.outputs) + reset_progress_state(prompt_id, dynamic_prompt) + add_progress_handler(WebUIProgressHandler(self.server)) + is_changed_cache = IsChangedCache(prompt_id, dynamic_prompt, self.caches.outputs) for cache in self.caches.all: - cache.set_prompt(dynamic_prompt, prompt.keys(), is_changed_cache) + await cache.set_prompt(dynamic_prompt, prompt.keys(), is_changed_cache) cache.clean_unused() cached_nodes = [] @@ -527,6 +609,7 @@ class PromptExecutor: { "nodes": cached_nodes, "prompt_id": prompt_id}, broadcast=False) pending_subgraph_results = {} + pending_async_nodes = {} # TODO - Unify this with pending_subgraph_results executed = set() execution_list = ExecutionList(dynamic_prompt, self.caches.outputs) current_outputs = self.caches.outputs.all_node_ids() @@ -534,12 +617,13 @@ class PromptExecutor: execution_list.add_node(node_id) while not execution_list.is_empty(): - node_id, error, ex = execution_list.stage_node_execution() + node_id, error, ex = await execution_list.stage_node_execution() if error is not None: self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex) break - result, error, ex = execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results) + assert node_id is not None, "Node ID should not be None at this point" + result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes) self.success = result != ExecutionResult.FAILURE if result == ExecutionResult.FAILURE: self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex) @@ -569,7 +653,7 @@ class PromptExecutor: comfy.model_management.unload_all_models() -def validate_inputs(prompt, item, validated): +async def validate_inputs(prompt_id, prompt, item, validated): unique_id = item if unique_id in validated: return validated[unique_id] @@ -646,7 +730,7 @@ def validate_inputs(prompt, item, validated): errors.append(error) continue try: - r = validate_inputs(prompt, o_id, validated) + r = await validate_inputs(prompt_id, prompt, o_id, validated) if r[0] is False: # `r` will be set in `validated[o_id]` already valid = False @@ -771,7 +855,8 @@ def validate_inputs(prompt, item, validated): input_filtered['input_types'] = [received_types] #ret = obj_class.VALIDATE_INPUTS(**input_filtered) - ret = _map_node_over_list(obj_class, input_filtered, "VALIDATE_INPUTS") + ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, "VALIDATE_INPUTS") + ret = await resolve_map_node_over_list_results(ret) for x in input_filtered: for i, r in enumerate(ret): if r is not True and not isinstance(r, ExecutionBlocker): @@ -804,7 +889,7 @@ def full_type_name(klass): return klass.__qualname__ return module + '.' + klass.__qualname__ -def validate_prompt(prompt): +async def validate_prompt(prompt_id, prompt): outputs = set() for x in prompt: if 'class_type' not in prompt[x]: @@ -847,7 +932,7 @@ def validate_prompt(prompt): valid = False reasons = [] try: - m = validate_inputs(prompt, o, validated) + m = await validate_inputs(prompt_id, prompt, o, validated) valid = m[0] reasons = m[1] except Exception as ex: diff --git a/main.py b/main.py index 0b1987ef4..2b4ffafd4 100644 --- a/main.py +++ b/main.py @@ -11,6 +11,9 @@ import itertools import utils.extra_config import logging import sys +from comfy_execution.progress import get_progress_state +from comfy_execution.utils import get_executing_context +from comfy_api import feature_flags if __name__ == "__main__": #NOTE: These do not do anything on core ComfyUI, they are for custom nodes. @@ -134,7 +137,7 @@ import comfy.utils import execution import server -from server import BinaryEventTypes +from protocol import BinaryEventTypes import nodes import comfy.model_management import comfyui_version @@ -230,15 +233,34 @@ async def run(server_instance, address='', port=8188, verbose=True, call_on_star server_instance.start_multi_address(addresses, call_on_start, verbose), server_instance.publish_loop() ) - def hijack_progress(server_instance): - def hook(value, total, preview_image): + def hook(value, total, preview_image, prompt_id=None, node_id=None): + executing_context = get_executing_context() + if prompt_id is None and executing_context is not None: + prompt_id = executing_context.prompt_id + if node_id is None and executing_context is not None: + node_id = executing_context.node_id comfy.model_management.throw_exception_if_processing_interrupted() - progress = {"value": value, "max": total, "prompt_id": server_instance.last_prompt_id, "node": server_instance.last_node_id} + if prompt_id is None: + prompt_id = server_instance.last_prompt_id + if node_id is None: + node_id = server_instance.last_node_id + progress = {"value": value, "max": total, "prompt_id": prompt_id, "node": node_id} + get_progress_state().update_progress(node_id, value, total, preview_image) server_instance.send_sync("progress", progress, server_instance.client_id) if preview_image is not None: - server_instance.send_sync(BinaryEventTypes.UNENCODED_PREVIEW_IMAGE, preview_image, server_instance.client_id) + # Only send old method if client doesn't support preview metadata + if not feature_flags.supports_feature( + server_instance.sockets_metadata, + server_instance.client_id, + "supports_preview_metadata", + ): + server_instance.send_sync( + BinaryEventTypes.UNENCODED_PREVIEW_IMAGE, + preview_image, + server_instance.client_id, + ) comfy.utils.set_progress_bar_global_hook(hook) diff --git a/protocol.py b/protocol.py new file mode 100644 index 000000000..038a0a840 --- /dev/null +++ b/protocol.py @@ -0,0 +1,7 @@ + +class BinaryEventTypes: + PREVIEW_IMAGE = 1 + UNENCODED_PREVIEW_IMAGE = 2 + TEXT = 3 + PREVIEW_IMAGE_WITH_METADATA = 4 + diff --git a/server.py b/server.py index 878b5eeb1..e8bad9f4e 100644 --- a/server.py +++ b/server.py @@ -26,6 +26,7 @@ import mimetypes from comfy.cli_args import args import comfy.utils import comfy.model_management +from comfy_api import feature_flags import node_helpers from comfyui_version import __version__ from app.frontend_management import FrontendManager @@ -35,11 +36,7 @@ from app.model_manager import ModelFileManager from app.custom_node_manager import CustomNodeManager from typing import Optional, Union from api_server.routes.internal.internal_routes import InternalRoutes - -class BinaryEventTypes: - PREVIEW_IMAGE = 1 - UNENCODED_PREVIEW_IMAGE = 2 - TEXT = 3 +from protocol import BinaryEventTypes async def send_socket_catch_exception(function, message): try: @@ -178,6 +175,7 @@ class PromptServer(): max_upload_size = round(args.max_upload_size * 1024 * 1024) self.app = web.Application(client_max_size=max_upload_size, middlewares=middlewares) self.sockets = dict() + self.sockets_metadata = dict() self.web_root = ( FrontendManager.init_frontend(args.front_end_version) if args.front_end_root is None @@ -202,20 +200,53 @@ class PromptServer(): else: sid = uuid.uuid4().hex + # Store WebSocket for backward compatibility self.sockets[sid] = ws + # Store metadata separately + self.sockets_metadata[sid] = {"feature_flags": {}} try: # Send initial state to the new client - await self.send("status", { "status": self.get_queue_info(), 'sid': sid }, sid) + await self.send("status", {"status": self.get_queue_info(), "sid": sid}, sid) # On reconnect if we are the currently executing client send the current node if self.client_id == sid and self.last_node_id is not None: await self.send("executing", { "node": self.last_node_id }, sid) + # Flag to track if we've received the first message + first_message = True + async for msg in ws: if msg.type == aiohttp.WSMsgType.ERROR: logging.warning('ws connection closed with exception %s' % ws.exception()) + elif msg.type == aiohttp.WSMsgType.TEXT: + try: + data = json.loads(msg.data) + # Check if first message is feature flags + if first_message and data.get("type") == "feature_flags": + # Store client feature flags + client_flags = data.get("data", {}) + self.sockets_metadata[sid]["feature_flags"] = client_flags + + # Send server feature flags in response + await self.send( + "feature_flags", + feature_flags.get_server_features(), + sid, + ) + + logging.info( + f"Feature flags negotiated for client {sid}: {client_flags}" + ) + first_message = False + except json.JSONDecodeError: + logging.warning( + f"Invalid JSON received from client {sid}: {msg.data}" + ) + except Exception as e: + logging.error(f"Error processing WebSocket message: {e}") finally: self.sockets.pop(sid, None) + self.sockets_metadata.pop(sid, None) return ws @routes.get("/") @@ -548,6 +579,10 @@ class PromptServer(): } return web.json_response(system_stats) + @routes.get("/features") + async def get_features(request): + return web.json_response(feature_flags.get_server_features()) + @routes.get("/prompt") async def get_prompt(request): return web.json_response(self.get_queue_info()) @@ -643,7 +678,8 @@ class PromptServer(): if "prompt" in json_data: prompt = json_data["prompt"] - valid = execution.validate_prompt(prompt) + prompt_id = str(uuid.uuid4()) + valid = await execution.validate_prompt(prompt_id, prompt) extra_data = {} if "extra_data" in json_data: extra_data = json_data["extra_data"] @@ -651,7 +687,6 @@ class PromptServer(): if "client_id" in json_data: extra_data["client_id"] = json_data["client_id"] if valid[0]: - prompt_id = str(uuid.uuid4()) outputs_to_execute = valid[2] self.prompt_queue.put((number, prompt_id, prompt, extra_data, outputs_to_execute)) response = {"prompt_id": prompt_id, "number": number, "node_errors": valid[3]} @@ -766,6 +801,10 @@ class PromptServer(): async def send(self, event, data, sid=None): if event == BinaryEventTypes.UNENCODED_PREVIEW_IMAGE: await self.send_image(data, sid=sid) + elif event == BinaryEventTypes.PREVIEW_IMAGE_WITH_METADATA: + # data is (preview_image, metadata) + preview_image, metadata = data + await self.send_image_with_metadata(preview_image, metadata, sid=sid) elif isinstance(data, (bytes, bytearray)): await self.send_bytes(event, data, sid) else: @@ -804,6 +843,43 @@ class PromptServer(): preview_bytes = bytesIO.getvalue() await self.send_bytes(BinaryEventTypes.PREVIEW_IMAGE, preview_bytes, sid=sid) + async def send_image_with_metadata(self, image_data, metadata=None, sid=None): + image_type = image_data[0] + image = image_data[1] + max_size = image_data[2] + if max_size is not None: + if hasattr(Image, 'Resampling'): + resampling = Image.Resampling.BILINEAR + else: + resampling = Image.Resampling.LANCZOS + + image = ImageOps.contain(image, (max_size, max_size), resampling) + + mimetype = "image/png" if image_type == "PNG" else "image/jpeg" + + # Prepare metadata + if metadata is None: + metadata = {} + metadata["image_type"] = mimetype + + # Serialize metadata as JSON + import json + metadata_json = json.dumps(metadata).encode('utf-8') + metadata_length = len(metadata_json) + + # Prepare image data + bytesIO = BytesIO() + image.save(bytesIO, format=image_type, quality=95, compress_level=1) + image_bytes = bytesIO.getvalue() + + # Combine metadata and image + combined_data = bytearray() + combined_data.extend(struct.pack(">I", metadata_length)) + combined_data.extend(metadata_json) + combined_data.extend(image_bytes) + + await self.send_bytes(BinaryEventTypes.PREVIEW_IMAGE_WITH_METADATA, combined_data, sid=sid) + async def send_bytes(self, event, data, sid=None): message = self.encode_bytes(event, data) @@ -845,10 +921,10 @@ class PromptServer(): ssl_ctx = None scheme = "http" if args.tls_keyfile and args.tls_certfile: - ssl_ctx = ssl.SSLContext(protocol=ssl.PROTOCOL_TLS_SERVER, verify_mode=ssl.CERT_NONE) - ssl_ctx.load_cert_chain(certfile=args.tls_certfile, + ssl_ctx = ssl.SSLContext(protocol=ssl.PROTOCOL_TLS_SERVER, verify_mode=ssl.CERT_NONE) + ssl_ctx.load_cert_chain(certfile=args.tls_certfile, keyfile=args.tls_keyfile) - scheme = "https" + scheme = "https" if verbose: logging.info("Starting server\n") diff --git a/tests-unit/feature_flags_test.py b/tests-unit/feature_flags_test.py new file mode 100644 index 000000000..f2702cfc8 --- /dev/null +++ b/tests-unit/feature_flags_test.py @@ -0,0 +1,98 @@ +"""Tests for feature flags functionality.""" + +from comfy_api.feature_flags import ( + get_connection_feature, + supports_feature, + get_server_features, + SERVER_FEATURE_FLAGS, +) + + +class TestFeatureFlags: + """Test suite for feature flags functions.""" + + def test_get_server_features_returns_copy(self): + """Test that get_server_features returns a copy of the server flags.""" + features = get_server_features() + # Verify it's a copy by modifying it + features["test_flag"] = True + # Original should be unchanged + assert "test_flag" not in SERVER_FEATURE_FLAGS + + def test_get_server_features_contains_expected_flags(self): + """Test that server features contain expected flags.""" + features = get_server_features() + assert "supports_preview_metadata" in features + assert features["supports_preview_metadata"] is True + assert "max_upload_size" in features + assert isinstance(features["max_upload_size"], (int, float)) + + def test_get_connection_feature_with_missing_sid(self): + """Test getting feature for non-existent session ID.""" + sockets_metadata = {} + result = get_connection_feature(sockets_metadata, "missing_sid", "some_feature") + assert result is False # Default value + + def test_get_connection_feature_with_custom_default(self): + """Test getting feature with custom default value.""" + sockets_metadata = {} + result = get_connection_feature( + sockets_metadata, "missing_sid", "some_feature", default="custom_default" + ) + assert result == "custom_default" + + def test_get_connection_feature_with_feature_flags(self): + """Test getting feature from connection with feature flags.""" + sockets_metadata = { + "sid1": { + "feature_flags": { + "supports_preview_metadata": True, + "custom_feature": "value", + }, + } + } + result = get_connection_feature(sockets_metadata, "sid1", "supports_preview_metadata") + assert result is True + + result = get_connection_feature(sockets_metadata, "sid1", "custom_feature") + assert result == "value" + + def test_get_connection_feature_missing_feature(self): + """Test getting non-existent feature from connection.""" + sockets_metadata = { + "sid1": {"feature_flags": {"existing_feature": True}} + } + result = get_connection_feature(sockets_metadata, "sid1", "missing_feature") + assert result is False + + def test_supports_feature_returns_boolean(self): + """Test that supports_feature always returns boolean.""" + sockets_metadata = { + "sid1": { + "feature_flags": { + "bool_feature": True, + "string_feature": "value", + "none_feature": None, + }, + } + } + + # True boolean feature + assert supports_feature(sockets_metadata, "sid1", "bool_feature") is True + + # Non-boolean values should return False + assert supports_feature(sockets_metadata, "sid1", "string_feature") is False + assert supports_feature(sockets_metadata, "sid1", "none_feature") is False + assert supports_feature(sockets_metadata, "sid1", "missing_feature") is False + + def test_supports_feature_with_missing_connection(self): + """Test supports_feature with missing connection.""" + sockets_metadata = {} + assert supports_feature(sockets_metadata, "missing_sid", "any_feature") is False + + def test_empty_feature_flags_dict(self): + """Test connection with empty feature flags dictionary.""" + sockets_metadata = {"sid1": {"feature_flags": {}}} + result = get_connection_feature(sockets_metadata, "sid1", "any_feature") + assert result is False + assert supports_feature(sockets_metadata, "sid1", "any_feature") is False diff --git a/tests-unit/requirements.txt b/tests-unit/requirements.txt index d70d00f4b..3a6790ee0 100644 --- a/tests-unit/requirements.txt +++ b/tests-unit/requirements.txt @@ -1,3 +1,4 @@ pytest>=7.8.0 pytest-aiohttp pytest-asyncio +websocket-client diff --git a/tests-unit/websocket_feature_flags_test.py b/tests-unit/websocket_feature_flags_test.py new file mode 100644 index 000000000..e93b2e1dd --- /dev/null +++ b/tests-unit/websocket_feature_flags_test.py @@ -0,0 +1,77 @@ +"""Simplified tests for WebSocket feature flags functionality.""" +from comfy_api import feature_flags + + +class TestWebSocketFeatureFlags: + """Test suite for WebSocket feature flags integration.""" + + def test_server_feature_flags_response(self): + """Test server feature flags are properly formatted.""" + features = feature_flags.get_server_features() + + # Check expected server features + assert "supports_preview_metadata" in features + assert features["supports_preview_metadata"] is True + assert "max_upload_size" in features + assert isinstance(features["max_upload_size"], (int, float)) + + def test_progress_py_checks_feature_flags(self): + """Test that progress.py checks feature flags before sending metadata.""" + # This simulates the check in progress.py + client_id = "test_client" + sockets_metadata = {"test_client": {"feature_flags": {}}} + + # The actual check would be in progress.py + supports_metadata = feature_flags.supports_feature( + sockets_metadata, client_id, "supports_preview_metadata" + ) + + assert supports_metadata is False + + def test_multiple_clients_different_features(self): + """Test handling multiple clients with different feature support.""" + sockets_metadata = { + "modern_client": { + "feature_flags": {"supports_preview_metadata": True} + }, + "legacy_client": { + "feature_flags": {} + } + } + + # Check modern client + assert feature_flags.supports_feature( + sockets_metadata, "modern_client", "supports_preview_metadata" + ) is True + + # Check legacy client + assert feature_flags.supports_feature( + sockets_metadata, "legacy_client", "supports_preview_metadata" + ) is False + + def test_feature_negotiation_message_format(self): + """Test the format of feature negotiation messages.""" + # Client message format + client_message = { + "type": "feature_flags", + "data": { + "supports_preview_metadata": True, + "api_version": "1.0.0" + } + } + + # Verify structure + assert client_message["type"] == "feature_flags" + assert "supports_preview_metadata" in client_message["data"] + + # Server response format (what would be sent) + server_features = feature_flags.get_server_features() + server_message = { + "type": "feature_flags", + "data": server_features + } + + # Verify structure + assert server_message["type"] == "feature_flags" + assert "supports_preview_metadata" in server_message["data"] + assert server_message["data"]["supports_preview_metadata"] is True diff --git a/tests/inference/extra_model_paths.yaml b/tests/inference/extra_model_paths.yaml index 75b2e1ae4..68e056564 100644 --- a/tests/inference/extra_model_paths.yaml +++ b/tests/inference/extra_model_paths.yaml @@ -1,4 +1,4 @@ # Config for testing nodes testing: - custom_nodes: tests/inference/testing_nodes + custom_nodes: testing_nodes diff --git a/tests/inference/test_async_nodes.py b/tests/inference/test_async_nodes.py new file mode 100644 index 000000000..b243bbca9 --- /dev/null +++ b/tests/inference/test_async_nodes.py @@ -0,0 +1,410 @@ +import pytest +import time +import torch +import urllib.error +import numpy as np +import subprocess + +from pytest import fixture +from comfy_execution.graph_utils import GraphBuilder +from tests.inference.test_execution import ComfyClient + + +@pytest.mark.execution +class TestAsyncNodes: + @fixture(scope="class", autouse=True, params=[ + (False, 0), + (True, 0), + (True, 100), + ]) + def _server(self, args_pytest, request): + pargs = [ + 'python','main.py', + '--output-directory', args_pytest["output_dir"], + '--listen', args_pytest["listen"], + '--port', str(args_pytest["port"]), + '--extra-model-paths-config', 'tests/inference/extra_model_paths.yaml', + ] + use_lru, lru_size = request.param + if use_lru: + pargs += ['--cache-lru', str(lru_size)] + # Running server with args: pargs + p = subprocess.Popen(pargs) + yield + p.kill() + torch.cuda.empty_cache() + + @fixture(scope="class", autouse=True) + def shared_client(self, args_pytest, _server): + client = ComfyClient() + n_tries = 5 + for i in range(n_tries): + time.sleep(4) + try: + client.connect(listen=args_pytest["listen"], port=args_pytest["port"]) + except ConnectionRefusedError: + # Retrying... + pass + else: + break + yield client + del client + torch.cuda.empty_cache() + + @fixture + def client(self, shared_client, request): + shared_client.set_test_name(f"async_nodes[{request.node.name}]") + yield shared_client + + @fixture + def builder(self, request): + yield GraphBuilder(prefix=request.node.name) + + # Happy Path Tests + + def test_basic_async_execution(self, client: ComfyClient, builder: GraphBuilder): + """Test that a basic async node executes correctly.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + sleep_node = g.node("TestSleep", value=image.out(0), seconds=0.1) + output = g.node("SaveImage", images=sleep_node.out(0)) + + result = client.run(g) + + # Verify execution completed + assert result.did_run(sleep_node), "Async sleep node should have executed" + assert result.did_run(output), "Output node should have executed" + + # Verify the image passed through correctly + result_images = result.get_images(output) + assert len(result_images) == 1, "Should have 1 image" + assert np.array(result_images[0]).min() == 0 and np.array(result_images[0]).max() == 0, "Image should be black" + + def test_multiple_async_parallel_execution(self, client: ComfyClient, builder: GraphBuilder): + """Test that multiple async nodes execute in parallel.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + + # Create multiple async sleep nodes with different durations + sleep1 = g.node("TestSleep", value=image.out(0), seconds=0.3) + sleep2 = g.node("TestSleep", value=image.out(0), seconds=0.4) + sleep3 = g.node("TestSleep", value=image.out(0), seconds=0.5) + + # Add outputs for each + _output1 = g.node("PreviewImage", images=sleep1.out(0)) + _output2 = g.node("PreviewImage", images=sleep2.out(0)) + _output3 = g.node("PreviewImage", images=sleep3.out(0)) + + start_time = time.time() + result = client.run(g) + elapsed_time = time.time() - start_time + + # Should take ~0.5s (max duration) not 1.2s (sum of durations) + assert elapsed_time < 0.8, f"Parallel execution took {elapsed_time}s, expected < 0.8s" + + # Verify all nodes executed + assert result.did_run(sleep1) and result.did_run(sleep2) and result.did_run(sleep3) + + def test_async_with_dependencies(self, client: ComfyClient, builder: GraphBuilder): + """Test async nodes with proper dependency handling.""" + g = builder + image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + + # Chain of async operations + sleep1 = g.node("TestSleep", value=image1.out(0), seconds=0.2) + sleep2 = g.node("TestSleep", value=image2.out(0), seconds=0.2) + + # Average depends on both async results + average = g.node("TestVariadicAverage", input1=sleep1.out(0), input2=sleep2.out(0)) + output = g.node("SaveImage", images=average.out(0)) + + result = client.run(g) + + # Verify execution order + assert result.did_run(sleep1) and result.did_run(sleep2) + assert result.did_run(average) and result.did_run(output) + + # Verify averaged result + result_images = result.get_images(output) + avg_value = np.array(result_images[0]).mean() + assert abs(avg_value - 127.5) < 1, f"Average value {avg_value} should be ~127.5" + + def test_async_validate_inputs(self, client: ComfyClient, builder: GraphBuilder): + """Test async VALIDATE_INPUTS function.""" + g = builder + # Create a test node with async validation + validation_node = g.node("TestAsyncValidation", value=5.0, threshold=10.0) + g.node("SaveImage", images=validation_node.out(0)) + + # Should pass validation + result = client.run(g) + assert result.did_run(validation_node) + + # Test validation failure + validation_node.inputs['threshold'] = 3.0 # Will fail since value > threshold + with pytest.raises(urllib.error.HTTPError): + client.run(g) + + def test_async_lazy_evaluation(self, client: ComfyClient, builder: GraphBuilder): + """Test async nodes with lazy evaluation.""" + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + mask = g.node("StubMask", value=0.0, height=512, width=512, batch_size=1) + + # Create async nodes that will be evaluated lazily + sleep1 = g.node("TestSleep", value=input1.out(0), seconds=0.3) + sleep2 = g.node("TestSleep", value=input2.out(0), seconds=0.3) + + # Use lazy mix that only needs sleep1 (mask=0.0) + lazy_mix = g.node("TestLazyMixImages", image1=sleep1.out(0), image2=sleep2.out(0), mask=mask.out(0)) + g.node("SaveImage", images=lazy_mix.out(0)) + + start_time = time.time() + result = client.run(g) + elapsed_time = time.time() - start_time + + # Should only execute sleep1, not sleep2 + assert elapsed_time < 0.5, f"Should skip sleep2, took {elapsed_time}s" + assert result.did_run(sleep1), "Sleep1 should have executed" + assert not result.did_run(sleep2), "Sleep2 should have been skipped" + + def test_async_check_lazy_status(self, client: ComfyClient, builder: GraphBuilder): + """Test async check_lazy_status function.""" + g = builder + # Create a node with async check_lazy_status + lazy_node = g.node("TestAsyncLazyCheck", + input1="value1", + input2="value2", + condition=True) + g.node("SaveImage", images=lazy_node.out(0)) + + result = client.run(g) + assert result.did_run(lazy_node) + + # Error Handling Tests + + def test_async_execution_error(self, client: ComfyClient, builder: GraphBuilder): + """Test that async execution errors are properly handled.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + # Create an async node that will error + error_node = g.node("TestAsyncError", value=image.out(0), error_after=0.1) + g.node("SaveImage", images=error_node.out(0)) + + try: + client.run(g) + assert False, "Should have raised an error" + except Exception as e: + assert 'prompt_id' in e.args[0], f"Did not get proper error message: {e}" + assert e.args[0]['node_id'] == error_node.id, "Error should be from async error node" + + def test_async_validation_error(self, client: ComfyClient, builder: GraphBuilder): + """Test async validation error handling.""" + g = builder + # Node with async validation that will fail + validation_node = g.node("TestAsyncValidationError", value=15.0, max_value=10.0) + g.node("SaveImage", images=validation_node.out(0)) + + with pytest.raises(urllib.error.HTTPError) as exc_info: + client.run(g) + # Verify it's a validation error + assert exc_info.value.code == 400 + + def test_async_timeout_handling(self, client: ComfyClient, builder: GraphBuilder): + """Test handling of async operations that timeout.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + # Very long sleep that would timeout + timeout_node = g.node("TestAsyncTimeout", value=image.out(0), timeout=0.5, operation_time=2.0) + g.node("SaveImage", images=timeout_node.out(0)) + + try: + client.run(g) + assert False, "Should have raised a timeout error" + except Exception as e: + assert 'timeout' in str(e).lower(), f"Expected timeout error, got: {e}" + + def test_concurrent_async_error_recovery(self, client: ComfyClient, builder: GraphBuilder): + """Test that workflow can recover after async errors.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + + # First run with error + error_node = g.node("TestAsyncError", value=image.out(0), error_after=0.1) + g.node("SaveImage", images=error_node.out(0)) + + try: + client.run(g) + except Exception: + pass # Expected + + # Second run should succeed + g2 = GraphBuilder(prefix="recovery_test") + image2 = g2.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + sleep_node = g2.node("TestSleep", value=image2.out(0), seconds=0.1) + g2.node("SaveImage", images=sleep_node.out(0)) + + result = client.run(g2) + assert result.did_run(sleep_node), "Should be able to run after error" + + def test_sync_error_during_async_execution(self, client: ComfyClient, builder: GraphBuilder): + """Test handling when sync node errors while async node is executing.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + + # Async node that takes time + sleep_node = g.node("TestSleep", value=image.out(0), seconds=0.5) + + # Sync node that will error immediately + error_node = g.node("TestSyncError", value=image.out(0)) + + # Both feed into output + g.node("PreviewImage", images=sleep_node.out(0)) + g.node("PreviewImage", images=error_node.out(0)) + + try: + client.run(g) + assert False, "Should have raised an error" + except Exception as e: + # Verify the sync error was caught even though async was running + assert 'prompt_id' in e.args[0] + + # Edge Cases + + def test_async_with_execution_blocker(self, client: ComfyClient, builder: GraphBuilder): + """Test async nodes with execution blockers.""" + g = builder + image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + + # Async sleep nodes + sleep1 = g.node("TestSleep", value=image1.out(0), seconds=0.2) + sleep2 = g.node("TestSleep", value=image2.out(0), seconds=0.2) + + # Create list of images + image_list = g.node("TestMakeListNode", value1=sleep1.out(0), value2=sleep2.out(0)) + + # Create list of blocking conditions - [False, True] to block only the second item + int1 = g.node("StubInt", value=1) + int2 = g.node("StubInt", value=2) + block_list = g.node("TestMakeListNode", value1=int1.out(0), value2=int2.out(0)) + + # Compare each value against 2, so first is False (1 != 2) and second is True (2 == 2) + compare = g.node("TestIntConditions", a=block_list.out(0), b=2, operation="==") + + # Block based on the comparison results + blocker = g.node("TestExecutionBlocker", input=image_list.out(0), block=compare.out(0), verbose=False) + + output = g.node("PreviewImage", images=blocker.out(0)) + + result = client.run(g) + images = result.get_images(output) + assert len(images) == 1, "Should have blocked second image" + + def test_async_caching_behavior(self, client: ComfyClient, builder: GraphBuilder): + """Test that async nodes are properly cached.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + sleep_node = g.node("TestSleep", value=image.out(0), seconds=0.2) + g.node("SaveImage", images=sleep_node.out(0)) + + # First run + result1 = client.run(g) + assert result1.did_run(sleep_node), "Should run first time" + + # Second run - should be cached + start_time = time.time() + result2 = client.run(g) + elapsed_time = time.time() - start_time + + assert not result2.did_run(sleep_node), "Should be cached" + assert elapsed_time < 0.1, f"Cached run took {elapsed_time}s, should be instant" + + def test_async_with_dynamic_prompts(self, client: ComfyClient, builder: GraphBuilder): + """Test async nodes within dynamically generated prompts.""" + g = builder + image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + + # Node that generates async nodes dynamically + dynamic_async = g.node("TestDynamicAsyncGeneration", + image1=image1.out(0), + image2=image2.out(0), + num_async_nodes=3, + sleep_duration=0.2) + g.node("SaveImage", images=dynamic_async.out(0)) + + start_time = time.time() + result = client.run(g) + elapsed_time = time.time() - start_time + + # Should execute async nodes in parallel within dynamic prompt + assert elapsed_time < 0.5, f"Dynamic async execution took {elapsed_time}s" + assert result.did_run(dynamic_async) + + def test_async_resource_cleanup(self, client: ComfyClient, builder: GraphBuilder): + """Test that async resources are properly cleaned up.""" + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + + # Create multiple async nodes that use resources + resource_nodes = [] + for i in range(5): + node = g.node("TestAsyncResourceUser", + value=image.out(0), + resource_id=f"resource_{i}", + duration=0.1) + resource_nodes.append(node) + g.node("PreviewImage", images=node.out(0)) + + result = client.run(g) + + # Verify all nodes executed + for node in resource_nodes: + assert result.did_run(node) + + # Run again to ensure resources were cleaned up + result2 = client.run(g) + # Should be cached but not error due to resource conflicts + for node in resource_nodes: + assert not result2.did_run(node), "Should be cached" + + def test_async_cancellation(self, client: ComfyClient, builder: GraphBuilder): + """Test cancellation of async operations.""" + # This would require implementing cancellation in the client + # For now, we'll test that long-running async operations can be interrupted + pass # TODO: Implement when cancellation API is available + + def test_mixed_sync_async_execution(self, client: ComfyClient, builder: GraphBuilder): + """Test workflows with both sync and async nodes.""" + g = builder + image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + mask = g.node("StubMask", value=0.5, height=512, width=512, batch_size=1) + + # Mix of sync and async operations + # Sync: lazy mix images + sync_op1 = g.node("TestLazyMixImages", image1=image1.out(0), image2=image2.out(0), mask=mask.out(0)) + # Async: sleep + async_op1 = g.node("TestSleep", value=sync_op1.out(0), seconds=0.2) + # Sync: custom validation + sync_op2 = g.node("TestCustomValidation1", input1=async_op1.out(0), input2=0.5) + # Async: sleep again + async_op2 = g.node("TestSleep", value=sync_op2.out(0), seconds=0.2) + + output = g.node("SaveImage", images=async_op2.out(0)) + + result = client.run(g) + + # Verify all nodes executed in correct order + assert result.did_run(sync_op1) + assert result.did_run(async_op1) + assert result.did_run(sync_op2) + assert result.did_run(async_op2) + + # Image should be a mix of black and white (gray) + result_images = result.get_images(output) + avg_value = np.array(result_images[0]).mean() + assert abs(avg_value - 63.75) < 5, f"Average value {avg_value} should be ~63.75" diff --git a/tests/inference/test_execution.py b/tests/inference/test_execution.py index 5cda5c1ae..9d3d685cc 100644 --- a/tests/inference/test_execution.py +++ b/tests/inference/test_execution.py @@ -252,7 +252,7 @@ class TestExecution: @pytest.mark.parametrize("test_type, test_value", [ ("StubInt", 5), - ("StubFloat", 5.0) + ("StubMask", 5.0) ]) def test_validation_error_edge1(self, test_type, test_value, client: ComfyClient, builder: GraphBuilder): g = builder @@ -497,6 +497,69 @@ class TestExecution: assert numpy.array(images[0]).min() == 63 and numpy.array(images[0]).max() == 63, "Image should have value 0.25" assert not result.did_run(test_node), "The execution should have been cached" + def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder): + g = builder + image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + + # Create sleep nodes for each duration + sleep_node1 = g.node("TestSleep", value=image.out(0), seconds=2.8) + sleep_node2 = g.node("TestSleep", value=image.out(0), seconds=2.9) + sleep_node3 = g.node("TestSleep", value=image.out(0), seconds=3.0) + + # Add outputs to verify the execution + _output1 = g.node("PreviewImage", images=sleep_node1.out(0)) + _output2 = g.node("PreviewImage", images=sleep_node2.out(0)) + _output3 = g.node("PreviewImage", images=sleep_node3.out(0)) + + start_time = time.time() + result = client.run(g) + elapsed_time = time.time() - start_time + + # The test should take around 0.4 seconds (the longest sleep duration) + # plus some overhead, but definitely less than the sum of all sleeps (0.9s) + # We'll allow for up to 0.8s total to account for overhead + assert elapsed_time < 4.0, f"Parallel execution took {elapsed_time}s, expected less than 0.8s" + + # Verify that all nodes executed + assert result.did_run(sleep_node1), "Sleep node 1 should have run" + assert result.did_run(sleep_node2), "Sleep node 2 should have run" + assert result.did_run(sleep_node3), "Sleep node 3 should have run" + + def test_parallel_sleep_expansion(self, client: ComfyClient, builder: GraphBuilder): + g = builder + # Create input images with different values + image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + image3 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + + # Create a TestParallelSleep node that expands into multiple TestSleep nodes + parallel_sleep = g.node("TestParallelSleep", + image1=image1.out(0), + image2=image2.out(0), + image3=image3.out(0), + sleep1=0.4, + sleep2=0.5, + sleep3=0.6) + output = g.node("SaveImage", images=parallel_sleep.out(0)) + + start_time = time.time() + result = client.run(g) + elapsed_time = time.time() - start_time + + # Similar to the previous test, expect parallel execution of the sleep nodes + # which should complete in less than the sum of all sleeps + assert elapsed_time < 0.8, f"Expansion execution took {elapsed_time}s, expected less than 0.8s" + + # Verify the parallel sleep node executed + assert result.did_run(parallel_sleep), "ParallelSleep node should have run" + + # Verify we get an image as output (blend of the three input images) + result_images = result.get_images(output) + assert len(result_images) == 1, "Should have 1 image" + # Average pixel value should be around 170 (255 * 2 // 3) + avg_value = numpy.array(result_images[0]).mean() + assert avg_value == 170, f"Image average value {avg_value} should be 170" + # This tests that nodes with OUTPUT_IS_LIST function correctly when they receive an ExecutionBlocker # as input. We also test that when that list (containing an ExecutionBlocker) is passed to a node, # only that one entry in the list is blocked. diff --git a/tests/inference/testing_nodes/testing-pack/__init__.py b/tests/inference/testing_nodes/testing-pack/__init__.py index dcc71659a..20f9533c7 100644 --- a/tests/inference/testing_nodes/testing-pack/__init__.py +++ b/tests/inference/testing_nodes/testing-pack/__init__.py @@ -1,23 +1,26 @@ -from .specific_tests import TEST_NODE_CLASS_MAPPINGS, TEST_NODE_DISPLAY_NAME_MAPPINGS -from .flow_control import FLOW_CONTROL_NODE_CLASS_MAPPINGS, FLOW_CONTROL_NODE_DISPLAY_NAME_MAPPINGS -from .util import UTILITY_NODE_CLASS_MAPPINGS, UTILITY_NODE_DISPLAY_NAME_MAPPINGS -from .conditions import CONDITION_NODE_CLASS_MAPPINGS, CONDITION_NODE_DISPLAY_NAME_MAPPINGS -from .stubs import TEST_STUB_NODE_CLASS_MAPPINGS, TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS - -# NODE_CLASS_MAPPINGS = GENERAL_NODE_CLASS_MAPPINGS.update(COMPONENT_NODE_CLASS_MAPPINGS) -# NODE_DISPLAY_NAME_MAPPINGS = GENERAL_NODE_DISPLAY_NAME_MAPPINGS.update(COMPONENT_NODE_DISPLAY_NAME_MAPPINGS) - -NODE_CLASS_MAPPINGS = {} -NODE_CLASS_MAPPINGS.update(TEST_NODE_CLASS_MAPPINGS) -NODE_CLASS_MAPPINGS.update(FLOW_CONTROL_NODE_CLASS_MAPPINGS) -NODE_CLASS_MAPPINGS.update(UTILITY_NODE_CLASS_MAPPINGS) -NODE_CLASS_MAPPINGS.update(CONDITION_NODE_CLASS_MAPPINGS) -NODE_CLASS_MAPPINGS.update(TEST_STUB_NODE_CLASS_MAPPINGS) - -NODE_DISPLAY_NAME_MAPPINGS = {} -NODE_DISPLAY_NAME_MAPPINGS.update(TEST_NODE_DISPLAY_NAME_MAPPINGS) -NODE_DISPLAY_NAME_MAPPINGS.update(FLOW_CONTROL_NODE_DISPLAY_NAME_MAPPINGS) -NODE_DISPLAY_NAME_MAPPINGS.update(UTILITY_NODE_DISPLAY_NAME_MAPPINGS) -NODE_DISPLAY_NAME_MAPPINGS.update(CONDITION_NODE_DISPLAY_NAME_MAPPINGS) -NODE_DISPLAY_NAME_MAPPINGS.update(TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS) - +from .specific_tests import TEST_NODE_CLASS_MAPPINGS, TEST_NODE_DISPLAY_NAME_MAPPINGS +from .flow_control import FLOW_CONTROL_NODE_CLASS_MAPPINGS, FLOW_CONTROL_NODE_DISPLAY_NAME_MAPPINGS +from .util import UTILITY_NODE_CLASS_MAPPINGS, UTILITY_NODE_DISPLAY_NAME_MAPPINGS +from .conditions import CONDITION_NODE_CLASS_MAPPINGS, CONDITION_NODE_DISPLAY_NAME_MAPPINGS +from .stubs import TEST_STUB_NODE_CLASS_MAPPINGS, TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS +from .async_test_nodes import ASYNC_TEST_NODE_CLASS_MAPPINGS, ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS + +# NODE_CLASS_MAPPINGS = GENERAL_NODE_CLASS_MAPPINGS.update(COMPONENT_NODE_CLASS_MAPPINGS) +# NODE_DISPLAY_NAME_MAPPINGS = GENERAL_NODE_DISPLAY_NAME_MAPPINGS.update(COMPONENT_NODE_DISPLAY_NAME_MAPPINGS) + +NODE_CLASS_MAPPINGS = {} +NODE_CLASS_MAPPINGS.update(TEST_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(FLOW_CONTROL_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(UTILITY_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(CONDITION_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(TEST_STUB_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(ASYNC_TEST_NODE_CLASS_MAPPINGS) + +NODE_DISPLAY_NAME_MAPPINGS = {} +NODE_DISPLAY_NAME_MAPPINGS.update(TEST_NODE_DISPLAY_NAME_MAPPINGS) +NODE_DISPLAY_NAME_MAPPINGS.update(FLOW_CONTROL_NODE_DISPLAY_NAME_MAPPINGS) +NODE_DISPLAY_NAME_MAPPINGS.update(UTILITY_NODE_DISPLAY_NAME_MAPPINGS) +NODE_DISPLAY_NAME_MAPPINGS.update(CONDITION_NODE_DISPLAY_NAME_MAPPINGS) +NODE_DISPLAY_NAME_MAPPINGS.update(TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS) +NODE_DISPLAY_NAME_MAPPINGS.update(ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS) + diff --git a/tests/inference/testing_nodes/testing-pack/async_test_nodes.py b/tests/inference/testing_nodes/testing-pack/async_test_nodes.py new file mode 100644 index 000000000..547eea6f4 --- /dev/null +++ b/tests/inference/testing_nodes/testing-pack/async_test_nodes.py @@ -0,0 +1,343 @@ +import torch +import asyncio +from typing import Dict +from comfy.utils import ProgressBar +from comfy_execution.graph_utils import GraphBuilder +from comfy.comfy_types.node_typing import ComfyNodeABC +from comfy.comfy_types import IO + + +class TestAsyncValidation(ComfyNodeABC): + """Test node with async VALIDATE_INPUTS.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": ("FLOAT", {"default": 5.0}), + "threshold": ("FLOAT", {"default": 10.0}), + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "process" + CATEGORY = "_for_testing/async" + + @classmethod + async def VALIDATE_INPUTS(cls, value, threshold): + # Simulate async validation (e.g., checking remote service) + await asyncio.sleep(0.05) + + if value > threshold: + return f"Value {value} exceeds threshold {threshold}" + return True + + def process(self, value, threshold): + # Create image based on value + intensity = value / 10.0 + image = torch.ones([1, 512, 512, 3]) * intensity + return (image,) + + +class TestAsyncError(ComfyNodeABC): + """Test node that errors during async execution.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": (IO.ANY, {}), + "error_after": ("FLOAT", {"default": 0.1, "min": 0.0, "max": 10.0}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "error_execution" + CATEGORY = "_for_testing/async" + + async def error_execution(self, value, error_after): + await asyncio.sleep(error_after) + raise RuntimeError("Intentional async execution error for testing") + + +class TestAsyncValidationError(ComfyNodeABC): + """Test node with async validation that always fails.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": ("FLOAT", {"default": 5.0}), + "max_value": ("FLOAT", {"default": 10.0}), + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "process" + CATEGORY = "_for_testing/async" + + @classmethod + async def VALIDATE_INPUTS(cls, value, max_value): + await asyncio.sleep(0.05) + # Always fail validation for values > max_value + if value > max_value: + return f"Async validation failed: {value} > {max_value}" + return True + + def process(self, value, max_value): + # This won't be reached if validation fails + image = torch.ones([1, 512, 512, 3]) * (value / max_value) + return (image,) + + +class TestAsyncTimeout(ComfyNodeABC): + """Test node that simulates timeout scenarios.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": (IO.ANY, {}), + "timeout": ("FLOAT", {"default": 1.0, "min": 0.1, "max": 10.0}), + "operation_time": ("FLOAT", {"default": 2.0, "min": 0.1, "max": 10.0}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "timeout_execution" + CATEGORY = "_for_testing/async" + + async def timeout_execution(self, value, timeout, operation_time): + try: + # This will timeout if operation_time > timeout + await asyncio.wait_for(asyncio.sleep(operation_time), timeout=timeout) + return (value,) + except asyncio.TimeoutError: + raise RuntimeError(f"Operation timed out after {timeout} seconds") + + +class TestSyncError(ComfyNodeABC): + """Test node that errors synchronously (for mixed sync/async testing).""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": (IO.ANY, {}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "sync_error" + CATEGORY = "_for_testing/async" + + def sync_error(self, value): + raise RuntimeError("Intentional sync execution error for testing") + + +class TestAsyncLazyCheck(ComfyNodeABC): + """Test node with async check_lazy_status.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "input1": (IO.ANY, {"lazy": True}), + "input2": (IO.ANY, {"lazy": True}), + "condition": ("BOOLEAN", {"default": True}), + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "process" + CATEGORY = "_for_testing/async" + + async def check_lazy_status(self, condition, input1, input2): + # Simulate async checking (e.g., querying remote service) + await asyncio.sleep(0.05) + + needed = [] + if condition and input1 is None: + needed.append("input1") + if not condition and input2 is None: + needed.append("input2") + return needed + + def process(self, input1, input2, condition): + # Return a simple image + return (torch.ones([1, 512, 512, 3]),) + + +class TestDynamicAsyncGeneration(ComfyNodeABC): + """Test node that dynamically generates async nodes.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "image1": ("IMAGE",), + "image2": ("IMAGE",), + "num_async_nodes": ("INT", {"default": 3, "min": 1, "max": 10}), + "sleep_duration": ("FLOAT", {"default": 0.2, "min": 0.1, "max": 1.0}), + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "generate_async_workflow" + CATEGORY = "_for_testing/async" + + def generate_async_workflow(self, image1, image2, num_async_nodes, sleep_duration): + g = GraphBuilder() + + # Create multiple async sleep nodes + sleep_nodes = [] + for i in range(num_async_nodes): + image = image1 if i % 2 == 0 else image2 + sleep_node = g.node("TestSleep", value=image, seconds=sleep_duration) + sleep_nodes.append(sleep_node) + + # Average all results + if len(sleep_nodes) == 1: + final_node = sleep_nodes[0] + else: + avg_inputs = {"input1": sleep_nodes[0].out(0)} + for i, node in enumerate(sleep_nodes[1:], 2): + avg_inputs[f"input{i}"] = node.out(0) + final_node = g.node("TestVariadicAverage", **avg_inputs) + + return { + "result": (final_node.out(0),), + "expand": g.finalize(), + } + + +class TestAsyncResourceUser(ComfyNodeABC): + """Test node that uses resources during async execution.""" + + # Class-level resource tracking for testing + _active_resources: Dict[str, bool] = {} + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": (IO.ANY, {}), + "resource_id": ("STRING", {"default": "resource_0"}), + "duration": ("FLOAT", {"default": 0.1, "min": 0.0, "max": 1.0}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "use_resource" + CATEGORY = "_for_testing/async" + + async def use_resource(self, value, resource_id, duration): + # Check if resource is already in use + if self._active_resources.get(resource_id, False): + raise RuntimeError(f"Resource {resource_id} is already in use!") + + # Mark resource as in use + self._active_resources[resource_id] = True + + try: + # Simulate resource usage + await asyncio.sleep(duration) + return (value,) + finally: + # Always clean up resource + self._active_resources[resource_id] = False + + +class TestAsyncBatchProcessing(ComfyNodeABC): + """Test async processing of batched inputs.""" + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "images": ("IMAGE",), + "process_time_per_item": ("FLOAT", {"default": 0.1, "min": 0.01, "max": 1.0}), + }, + "hidden": { + "unique_id": "UNIQUE_ID", + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "process_batch" + CATEGORY = "_for_testing/async" + + async def process_batch(self, images, process_time_per_item, unique_id): + batch_size = images.shape[0] + pbar = ProgressBar(batch_size, node_id=unique_id) + + # Process each image in the batch + processed = [] + for i in range(batch_size): + # Simulate async processing + await asyncio.sleep(process_time_per_item) + + # Simple processing: invert the image + processed_image = 1.0 - images[i:i+1] + processed.append(processed_image) + + pbar.update(1) + + # Stack processed images + result = torch.cat(processed, dim=0) + return (result,) + + +class TestAsyncConcurrentLimit(ComfyNodeABC): + """Test concurrent execution limits for async nodes.""" + + _semaphore = asyncio.Semaphore(2) # Only allow 2 concurrent executions + + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": (IO.ANY, {}), + "duration": ("FLOAT", {"default": 0.5, "min": 0.1, "max": 2.0}), + "node_id": ("INT", {"default": 0}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "limited_execution" + CATEGORY = "_for_testing/async" + + async def limited_execution(self, value, duration, node_id): + async with self._semaphore: + # Node {node_id} acquired semaphore + await asyncio.sleep(duration) + # Node {node_id} releasing semaphore + return (value,) + + +# Add node mappings +ASYNC_TEST_NODE_CLASS_MAPPINGS = { + "TestAsyncValidation": TestAsyncValidation, + "TestAsyncError": TestAsyncError, + "TestAsyncValidationError": TestAsyncValidationError, + "TestAsyncTimeout": TestAsyncTimeout, + "TestSyncError": TestSyncError, + "TestAsyncLazyCheck": TestAsyncLazyCheck, + "TestDynamicAsyncGeneration": TestDynamicAsyncGeneration, + "TestAsyncResourceUser": TestAsyncResourceUser, + "TestAsyncBatchProcessing": TestAsyncBatchProcessing, + "TestAsyncConcurrentLimit": TestAsyncConcurrentLimit, +} + +ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS = { + "TestAsyncValidation": "Test Async Validation", + "TestAsyncError": "Test Async Error", + "TestAsyncValidationError": "Test Async Validation Error", + "TestAsyncTimeout": "Test Async Timeout", + "TestSyncError": "Test Sync Error", + "TestAsyncLazyCheck": "Test Async Lazy Check", + "TestDynamicAsyncGeneration": "Test Dynamic Async Generation", + "TestAsyncResourceUser": "Test Async Resource User", + "TestAsyncBatchProcessing": "Test Async Batch Processing", + "TestAsyncConcurrentLimit": "Test Async Concurrent Limit", +} diff --git a/tests/inference/testing_nodes/testing-pack/specific_tests.py b/tests/inference/testing_nodes/testing-pack/specific_tests.py index 9d05ab14f..657d49f2f 100644 --- a/tests/inference/testing_nodes/testing-pack/specific_tests.py +++ b/tests/inference/testing_nodes/testing-pack/specific_tests.py @@ -1,6 +1,11 @@ import torch +import time +import asyncio +from comfy.utils import ProgressBar from .tools import VariantSupport from comfy_execution.graph_utils import GraphBuilder +from comfy.comfy_types.node_typing import ComfyNodeABC +from comfy.comfy_types import IO class TestLazyMixImages: @classmethod @@ -333,6 +338,131 @@ class TestMixedExpansionReturns: "expand": g.finalize(), } +class TestSamplingInExpansion: + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "model": ("MODEL",), + "clip": ("CLIP",), + "vae": ("VAE",), + "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}), + "steps": ("INT", {"default": 20, "min": 1, "max": 100}), + "cfg": ("FLOAT", {"default": 7.0, "min": 0.0, "max": 30.0}), + "prompt": ("STRING", {"multiline": True, "default": "a beautiful landscape with mountains and trees"}), + "negative_prompt": ("STRING", {"multiline": True, "default": "blurry, bad quality, worst quality"}), + }, + } + + RETURN_TYPES = ("IMAGE",) + FUNCTION = "sampling_in_expansion" + + CATEGORY = "Testing/Nodes" + + def sampling_in_expansion(self, model, clip, vae, seed, steps, cfg, prompt, negative_prompt): + g = GraphBuilder() + + # Create a basic image generation workflow using the input model, clip and vae + # 1. Setup text prompts using the provided CLIP model + positive_prompt = g.node("CLIPTextEncode", + text=prompt, + clip=clip) + negative_prompt = g.node("CLIPTextEncode", + text=negative_prompt, + clip=clip) + + # 2. Create empty latent with specified size + empty_latent = g.node("EmptyLatentImage", width=512, height=512, batch_size=1) + + # 3. Setup sampler and generate image latent + sampler = g.node("KSampler", + model=model, + positive=positive_prompt.out(0), + negative=negative_prompt.out(0), + latent_image=empty_latent.out(0), + seed=seed, + steps=steps, + cfg=cfg, + sampler_name="euler_ancestral", + scheduler="normal") + + # 4. Decode latent to image using VAE + output = g.node("VAEDecode", samples=sampler.out(0), vae=vae) + + return { + "result": (output.out(0),), + "expand": g.finalize(), + } + +class TestSleep(ComfyNodeABC): + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "value": (IO.ANY, {}), + "seconds": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 9999.0, "step": 0.01, "tooltip": "The amount of seconds to sleep."}), + }, + "hidden": { + "unique_id": "UNIQUE_ID", + }, + } + RETURN_TYPES = (IO.ANY,) + FUNCTION = "sleep" + + CATEGORY = "_for_testing" + + async def sleep(self, value, seconds, unique_id): + pbar = ProgressBar(seconds, node_id=unique_id) + start = time.time() + expiration = start + seconds + now = start + while now < expiration: + now = time.time() + pbar.update_absolute(now - start) + await asyncio.sleep(0.01) + return (value,) + +class TestParallelSleep(ComfyNodeABC): + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "image1": ("IMAGE", ), + "image2": ("IMAGE", ), + "image3": ("IMAGE", ), + "sleep1": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 10.0, "step": 0.01}), + "sleep2": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 10.0, "step": 0.01}), + "sleep3": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 10.0, "step": 0.01}), + }, + "hidden": { + "unique_id": "UNIQUE_ID", + }, + } + RETURN_TYPES = ("IMAGE",) + FUNCTION = "parallel_sleep" + CATEGORY = "_for_testing" + OUTPUT_NODE = True + + def parallel_sleep(self, image1, image2, image3, sleep1, sleep2, sleep3, unique_id): + # Create a graph dynamically with three TestSleep nodes + g = GraphBuilder() + + # Create sleep nodes for each duration and image + sleep_node1 = g.node("TestSleep", value=image1, seconds=sleep1) + sleep_node2 = g.node("TestSleep", value=image2, seconds=sleep2) + sleep_node3 = g.node("TestSleep", value=image3, seconds=sleep3) + + # Blend the results using TestVariadicAverage + blend = g.node("TestVariadicAverage", + input1=sleep_node1.out(0), + input2=sleep_node2.out(0), + input3=sleep_node3.out(0)) + + return { + "result": (blend.out(0),), + "expand": g.finalize(), + } + TEST_NODE_CLASS_MAPPINGS = { "TestLazyMixImages": TestLazyMixImages, "TestVariadicAverage": TestVariadicAverage, @@ -345,6 +475,9 @@ TEST_NODE_CLASS_MAPPINGS = { "TestCustomValidation5": TestCustomValidation5, "TestDynamicDependencyCycle": TestDynamicDependencyCycle, "TestMixedExpansionReturns": TestMixedExpansionReturns, + "TestSamplingInExpansion": TestSamplingInExpansion, + "TestSleep": TestSleep, + "TestParallelSleep": TestParallelSleep, } TEST_NODE_DISPLAY_NAME_MAPPINGS = { @@ -359,4 +492,7 @@ TEST_NODE_DISPLAY_NAME_MAPPINGS = { "TestCustomValidation5": "Custom Validation 5", "TestDynamicDependencyCycle": "Dynamic Dependency Cycle", "TestMixedExpansionReturns": "Mixed Expansion Returns", + "TestSamplingInExpansion": "Sampling In Expansion", + "TestSleep": "Test Sleep", + "TestParallelSleep": "Test Parallel Sleep", } From b7ff5bd14def548519cc09770b794ad176868761 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 10 Jul 2025 12:21:18 -0700 Subject: [PATCH 017/759] Fix python3.9 (#8858) --- comfy_execution/progress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py index 5645b3e3c..73dba3f75 100644 --- a/comfy_execution/progress.py +++ b/comfy_execution/progress.py @@ -317,7 +317,7 @@ class ProgressRegistry: handler.reset() # Global registry instance -global_progress_registry: ProgressRegistry | None = None +global_progress_registry: ProgressRegistry = None def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt") -> None: global global_progress_registry From 8f05fb48eaab6b5d6fd0747c5be358a60d93e186 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Thu, 10 Jul 2025 15:00:29 -0700 Subject: [PATCH 018/759] [fix] increase Kling API polling timeout to prevent user timeouts (#8860) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends polling duration from 10 minutes to ~68 minutes (256 attempts × 16 seconds) to accommodate longer Kling API operations that were frequently timing out for users. --- comfy_api_nodes/nodes_kling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 641cd6353..69e9e5cf0 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -132,6 +132,8 @@ def poll_until_finished( result_url_extractor=result_url_extractor, estimated_duration=estimated_duration, node_id=node_id, + poll_interval=16.0, + max_poll_attempts=256, ).execute() From 938d3e8216b9936913f105593b6127428feef473 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 10 Jul 2025 23:37:51 -0700 Subject: [PATCH 019/759] Remove windows line endings. (#8866) --- comfy/ldm/pixart/pixartms.py | 512 +++++++++++++++---------------- comfy/text_encoders/pixart_t5.py | 84 ++--- 2 files changed, 298 insertions(+), 298 deletions(-) diff --git a/comfy/ldm/pixart/pixartms.py b/comfy/ldm/pixart/pixartms.py index 7d4eebdce..d1ac49d84 100644 --- a/comfy/ldm/pixart/pixartms.py +++ b/comfy/ldm/pixart/pixartms.py @@ -1,256 +1,256 @@ -# Based on: -# https://github.com/PixArt-alpha/PixArt-alpha [Apache 2.0 license] -# https://github.com/PixArt-alpha/PixArt-sigma [Apache 2.0 license] -import torch -import torch.nn as nn - -from .blocks import ( - t2i_modulate, - CaptionEmbedder, - AttentionKVCompress, - MultiHeadCrossAttention, - T2IFinalLayer, - SizeEmbedder, -) -from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, PatchEmbed, Mlp, get_1d_sincos_pos_embed_from_grid_torch - - -def get_2d_sincos_pos_embed_torch(embed_dim, w, h, pe_interpolation=1.0, base_size=16, device=None, dtype=torch.float32): - grid_h, grid_w = torch.meshgrid( - torch.arange(h, device=device, dtype=dtype) / (h/base_size) / pe_interpolation, - torch.arange(w, device=device, dtype=dtype) / (w/base_size) / pe_interpolation, - indexing='ij' - ) - emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_h, device=device, dtype=dtype) - emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_w, device=device, dtype=dtype) - emb = torch.cat([emb_w, emb_h], dim=1) # (H*W, D) - return emb - -class PixArtMSBlock(nn.Module): - """ - A PixArt block with adaptive layer norm zero (adaLN-Zero) conditioning. - """ - def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., input_size=None, - sampling=None, sr_ratio=1, qk_norm=False, dtype=None, device=None, operations=None, **block_kwargs): - super().__init__() - self.hidden_size = hidden_size - self.norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.attn = AttentionKVCompress( - hidden_size, num_heads=num_heads, qkv_bias=True, sampling=sampling, sr_ratio=sr_ratio, - qk_norm=qk_norm, dtype=dtype, device=device, operations=operations, **block_kwargs - ) - self.cross_attn = MultiHeadCrossAttention( - hidden_size, num_heads, dtype=dtype, device=device, operations=operations, **block_kwargs - ) - self.norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - # to be compatible with lower version pytorch - approx_gelu = lambda: nn.GELU(approximate="tanh") - self.mlp = Mlp( - in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, - dtype=dtype, device=device, operations=operations - ) - self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5) - - def forward(self, x, y, t, mask=None, HW=None, **kwargs): - B, N, C = x.shape - - shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None].to(dtype=x.dtype, device=x.device) + t.reshape(B, 6, -1)).chunk(6, dim=1) - x = x + (gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa), HW=HW)) - x = x + self.cross_attn(x, y, mask) - x = x + (gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))) - - return x - - -### Core PixArt Model ### -class PixArtMS(nn.Module): - """ - Diffusion model with a Transformer backbone. - """ - def __init__( - self, - input_size=32, - patch_size=2, - in_channels=4, - hidden_size=1152, - depth=28, - num_heads=16, - mlp_ratio=4.0, - class_dropout_prob=0.1, - learn_sigma=True, - pred_sigma=True, - drop_path: float = 0., - caption_channels=4096, - pe_interpolation=None, - pe_precision=None, - config=None, - model_max_length=120, - micro_condition=True, - qk_norm=False, - kv_compress_config=None, - dtype=None, - device=None, - operations=None, - **kwargs, - ): - nn.Module.__init__(self) - self.dtype = dtype - self.pred_sigma = pred_sigma - self.in_channels = in_channels - self.out_channels = in_channels * 2 if pred_sigma else in_channels - self.patch_size = patch_size - self.num_heads = num_heads - self.pe_interpolation = pe_interpolation - self.pe_precision = pe_precision - self.hidden_size = hidden_size - self.depth = depth - - approx_gelu = lambda: nn.GELU(approximate="tanh") - self.t_block = nn.Sequential( - nn.SiLU(), - operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device) - ) - self.x_embedder = PatchEmbed( - patch_size=patch_size, - in_chans=in_channels, - embed_dim=hidden_size, - bias=True, - dtype=dtype, - device=device, - operations=operations - ) - self.t_embedder = TimestepEmbedder( - hidden_size, dtype=dtype, device=device, operations=operations, - ) - self.y_embedder = CaptionEmbedder( - in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, - act_layer=approx_gelu, token_num=model_max_length, - dtype=dtype, device=device, operations=operations, - ) - - self.micro_conditioning = micro_condition - if self.micro_conditioning: - self.csize_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations) - self.ar_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations) - - # For fixed sin-cos embedding: - # num_patches = (input_size // patch_size) * (input_size // patch_size) - # self.base_size = input_size // self.patch_size - # self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size)) - - drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] # stochastic depth decay rule - if kv_compress_config is None: - kv_compress_config = { - 'sampling': None, - 'scale_factor': 1, - 'kv_compress_layer': [], - } - self.blocks = nn.ModuleList([ - PixArtMSBlock( - hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i], - sampling=kv_compress_config['sampling'], - sr_ratio=int(kv_compress_config['scale_factor']) if i in kv_compress_config['kv_compress_layer'] else 1, - qk_norm=qk_norm, - dtype=dtype, - device=device, - operations=operations, - ) - for i in range(depth) - ]) - self.final_layer = T2IFinalLayer( - hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations - ) - - def forward_orig(self, x, timestep, y, mask=None, c_size=None, c_ar=None, **kwargs): - """ - Original forward pass of PixArt. - x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) - t: (N,) tensor of diffusion timesteps - y: (N, 1, 120, C) conditioning - ar: (N, 1): aspect ratio - cs: (N ,2) size conditioning for height/width - """ - B, C, H, W = x.shape - c_res = (H + W) // 2 - pe_interpolation = self.pe_interpolation - if pe_interpolation is None or self.pe_precision is not None: - # calculate pe_interpolation on-the-fly - pe_interpolation = round(c_res / (512/8.0), self.pe_precision or 0) - - pos_embed = get_2d_sincos_pos_embed_torch( - self.hidden_size, - h=(H // self.patch_size), - w=(W // self.patch_size), - pe_interpolation=pe_interpolation, - base_size=((round(c_res / 64) * 64) // self.patch_size), - device=x.device, - dtype=x.dtype, - ).unsqueeze(0) - - x = self.x_embedder(x) + pos_embed # (N, T, D), where T = H * W / patch_size ** 2 - t = self.t_embedder(timestep, x.dtype) # (N, D) - - if self.micro_conditioning and (c_size is not None and c_ar is not None): - bs = x.shape[0] - c_size = self.csize_embedder(c_size, bs) # (N, D) - c_ar = self.ar_embedder(c_ar, bs) # (N, D) - t = t + torch.cat([c_size, c_ar], dim=1) - - t0 = self.t_block(t) - y = self.y_embedder(y, self.training) # (N, D) - - if mask is not None: - if mask.shape[0] != y.shape[0]: - mask = mask.repeat(y.shape[0] // mask.shape[0], 1) - mask = mask.squeeze(1).squeeze(1) - y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1]) - y_lens = mask.sum(dim=1).tolist() - else: - y_lens = None - y = y.squeeze(1).view(1, -1, x.shape[-1]) - for block in self.blocks: - x = block(x, y, t0, y_lens, (H, W), **kwargs) # (N, T, D) - - x = self.final_layer(x, t) # (N, T, patch_size ** 2 * out_channels) - x = self.unpatchify(x, H, W) # (N, out_channels, H, W) - - return x - - def forward(self, x, timesteps, context, c_size=None, c_ar=None, **kwargs): - B, C, H, W = x.shape - - # Fallback for missing microconds - if self.micro_conditioning: - if c_size is None: - c_size = torch.tensor([H*8, W*8], dtype=x.dtype, device=x.device).repeat(B, 1) - - if c_ar is None: - c_ar = torch.tensor([H/W], dtype=x.dtype, device=x.device).repeat(B, 1) - - ## Still accepts the input w/o that dim but returns garbage - if len(context.shape) == 3: - context = context.unsqueeze(1) - - ## run original forward pass - out = self.forward_orig(x, timesteps, context, c_size=c_size, c_ar=c_ar) - - ## only return EPS - if self.pred_sigma: - return out[:, :self.in_channels] - return out - - def unpatchify(self, x, h, w): - """ - x: (N, T, patch_size**2 * C) - imgs: (N, H, W, C) - """ - c = self.out_channels - p = self.x_embedder.patch_size[0] - h = h // self.patch_size - w = w // self.patch_size - assert h * w == x.shape[1] - - x = x.reshape(shape=(x.shape[0], h, w, p, p, c)) - x = torch.einsum('nhwpqc->nchpwq', x) - imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p)) - return imgs +# Based on: +# https://github.com/PixArt-alpha/PixArt-alpha [Apache 2.0 license] +# https://github.com/PixArt-alpha/PixArt-sigma [Apache 2.0 license] +import torch +import torch.nn as nn + +from .blocks import ( + t2i_modulate, + CaptionEmbedder, + AttentionKVCompress, + MultiHeadCrossAttention, + T2IFinalLayer, + SizeEmbedder, +) +from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, PatchEmbed, Mlp, get_1d_sincos_pos_embed_from_grid_torch + + +def get_2d_sincos_pos_embed_torch(embed_dim, w, h, pe_interpolation=1.0, base_size=16, device=None, dtype=torch.float32): + grid_h, grid_w = torch.meshgrid( + torch.arange(h, device=device, dtype=dtype) / (h/base_size) / pe_interpolation, + torch.arange(w, device=device, dtype=dtype) / (w/base_size) / pe_interpolation, + indexing='ij' + ) + emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_h, device=device, dtype=dtype) + emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_w, device=device, dtype=dtype) + emb = torch.cat([emb_w, emb_h], dim=1) # (H*W, D) + return emb + +class PixArtMSBlock(nn.Module): + """ + A PixArt block with adaptive layer norm zero (adaLN-Zero) conditioning. + """ + def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., input_size=None, + sampling=None, sr_ratio=1, qk_norm=False, dtype=None, device=None, operations=None, **block_kwargs): + super().__init__() + self.hidden_size = hidden_size + self.norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) + self.attn = AttentionKVCompress( + hidden_size, num_heads=num_heads, qkv_bias=True, sampling=sampling, sr_ratio=sr_ratio, + qk_norm=qk_norm, dtype=dtype, device=device, operations=operations, **block_kwargs + ) + self.cross_attn = MultiHeadCrossAttention( + hidden_size, num_heads, dtype=dtype, device=device, operations=operations, **block_kwargs + ) + self.norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) + # to be compatible with lower version pytorch + approx_gelu = lambda: nn.GELU(approximate="tanh") + self.mlp = Mlp( + in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu, + dtype=dtype, device=device, operations=operations + ) + self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5) + + def forward(self, x, y, t, mask=None, HW=None, **kwargs): + B, N, C = x.shape + + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None].to(dtype=x.dtype, device=x.device) + t.reshape(B, 6, -1)).chunk(6, dim=1) + x = x + (gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa), HW=HW)) + x = x + self.cross_attn(x, y, mask) + x = x + (gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp))) + + return x + + +### Core PixArt Model ### +class PixArtMS(nn.Module): + """ + Diffusion model with a Transformer backbone. + """ + def __init__( + self, + input_size=32, + patch_size=2, + in_channels=4, + hidden_size=1152, + depth=28, + num_heads=16, + mlp_ratio=4.0, + class_dropout_prob=0.1, + learn_sigma=True, + pred_sigma=True, + drop_path: float = 0., + caption_channels=4096, + pe_interpolation=None, + pe_precision=None, + config=None, + model_max_length=120, + micro_condition=True, + qk_norm=False, + kv_compress_config=None, + dtype=None, + device=None, + operations=None, + **kwargs, + ): + nn.Module.__init__(self) + self.dtype = dtype + self.pred_sigma = pred_sigma + self.in_channels = in_channels + self.out_channels = in_channels * 2 if pred_sigma else in_channels + self.patch_size = patch_size + self.num_heads = num_heads + self.pe_interpolation = pe_interpolation + self.pe_precision = pe_precision + self.hidden_size = hidden_size + self.depth = depth + + approx_gelu = lambda: nn.GELU(approximate="tanh") + self.t_block = nn.Sequential( + nn.SiLU(), + operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device) + ) + self.x_embedder = PatchEmbed( + patch_size=patch_size, + in_chans=in_channels, + embed_dim=hidden_size, + bias=True, + dtype=dtype, + device=device, + operations=operations + ) + self.t_embedder = TimestepEmbedder( + hidden_size, dtype=dtype, device=device, operations=operations, + ) + self.y_embedder = CaptionEmbedder( + in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob, + act_layer=approx_gelu, token_num=model_max_length, + dtype=dtype, device=device, operations=operations, + ) + + self.micro_conditioning = micro_condition + if self.micro_conditioning: + self.csize_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations) + self.ar_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations) + + # For fixed sin-cos embedding: + # num_patches = (input_size // patch_size) * (input_size // patch_size) + # self.base_size = input_size // self.patch_size + # self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size)) + + drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] # stochastic depth decay rule + if kv_compress_config is None: + kv_compress_config = { + 'sampling': None, + 'scale_factor': 1, + 'kv_compress_layer': [], + } + self.blocks = nn.ModuleList([ + PixArtMSBlock( + hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i], + sampling=kv_compress_config['sampling'], + sr_ratio=int(kv_compress_config['scale_factor']) if i in kv_compress_config['kv_compress_layer'] else 1, + qk_norm=qk_norm, + dtype=dtype, + device=device, + operations=operations, + ) + for i in range(depth) + ]) + self.final_layer = T2IFinalLayer( + hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations + ) + + def forward_orig(self, x, timestep, y, mask=None, c_size=None, c_ar=None, **kwargs): + """ + Original forward pass of PixArt. + x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) + t: (N,) tensor of diffusion timesteps + y: (N, 1, 120, C) conditioning + ar: (N, 1): aspect ratio + cs: (N ,2) size conditioning for height/width + """ + B, C, H, W = x.shape + c_res = (H + W) // 2 + pe_interpolation = self.pe_interpolation + if pe_interpolation is None or self.pe_precision is not None: + # calculate pe_interpolation on-the-fly + pe_interpolation = round(c_res / (512/8.0), self.pe_precision or 0) + + pos_embed = get_2d_sincos_pos_embed_torch( + self.hidden_size, + h=(H // self.patch_size), + w=(W // self.patch_size), + pe_interpolation=pe_interpolation, + base_size=((round(c_res / 64) * 64) // self.patch_size), + device=x.device, + dtype=x.dtype, + ).unsqueeze(0) + + x = self.x_embedder(x) + pos_embed # (N, T, D), where T = H * W / patch_size ** 2 + t = self.t_embedder(timestep, x.dtype) # (N, D) + + if self.micro_conditioning and (c_size is not None and c_ar is not None): + bs = x.shape[0] + c_size = self.csize_embedder(c_size, bs) # (N, D) + c_ar = self.ar_embedder(c_ar, bs) # (N, D) + t = t + torch.cat([c_size, c_ar], dim=1) + + t0 = self.t_block(t) + y = self.y_embedder(y, self.training) # (N, D) + + if mask is not None: + if mask.shape[0] != y.shape[0]: + mask = mask.repeat(y.shape[0] // mask.shape[0], 1) + mask = mask.squeeze(1).squeeze(1) + y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1]) + y_lens = mask.sum(dim=1).tolist() + else: + y_lens = None + y = y.squeeze(1).view(1, -1, x.shape[-1]) + for block in self.blocks: + x = block(x, y, t0, y_lens, (H, W), **kwargs) # (N, T, D) + + x = self.final_layer(x, t) # (N, T, patch_size ** 2 * out_channels) + x = self.unpatchify(x, H, W) # (N, out_channels, H, W) + + return x + + def forward(self, x, timesteps, context, c_size=None, c_ar=None, **kwargs): + B, C, H, W = x.shape + + # Fallback for missing microconds + if self.micro_conditioning: + if c_size is None: + c_size = torch.tensor([H*8, W*8], dtype=x.dtype, device=x.device).repeat(B, 1) + + if c_ar is None: + c_ar = torch.tensor([H/W], dtype=x.dtype, device=x.device).repeat(B, 1) + + ## Still accepts the input w/o that dim but returns garbage + if len(context.shape) == 3: + context = context.unsqueeze(1) + + ## run original forward pass + out = self.forward_orig(x, timesteps, context, c_size=c_size, c_ar=c_ar) + + ## only return EPS + if self.pred_sigma: + return out[:, :self.in_channels] + return out + + def unpatchify(self, x, h, w): + """ + x: (N, T, patch_size**2 * C) + imgs: (N, H, W, C) + """ + c = self.out_channels + p = self.x_embedder.patch_size[0] + h = h // self.patch_size + w = w // self.patch_size + assert h * w == x.shape[1] + + x = x.reshape(shape=(x.shape[0], h, w, p, p, c)) + x = torch.einsum('nhwpqc->nchpwq', x) + imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p)) + return imgs diff --git a/comfy/text_encoders/pixart_t5.py b/comfy/text_encoders/pixart_t5.py index b8de6bc4e..5f383de07 100644 --- a/comfy/text_encoders/pixart_t5.py +++ b/comfy/text_encoders/pixart_t5.py @@ -1,42 +1,42 @@ -import os - -from comfy import sd1_clip -import comfy.text_encoders.t5 -import comfy.text_encoders.sd3_clip -from comfy.sd1_clip import gen_empty_tokens - -from transformers import T5TokenizerFast - -class T5XXLModel(comfy.text_encoders.sd3_clip.T5XXLModel): - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def gen_empty_tokens(self, special_tokens, *args, **kwargs): - # PixArt expects the negative to be all pad tokens - special_tokens = special_tokens.copy() - special_tokens.pop("end") - return gen_empty_tokens(special_tokens, *args, **kwargs) - -class PixArtT5XXL(sd1_clip.SD1ClipModel): - def __init__(self, device="cpu", dtype=None, model_options={}): - super().__init__(device=device, dtype=dtype, name="t5xxl", clip_model=T5XXLModel, model_options=model_options) - -class T5XXLTokenizer(sd1_clip.SDTokenizer): - def __init__(self, embedding_directory=None, tokenizer_data={}): - tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer") - super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data) # no padding - -class PixArtTokenizer(sd1_clip.SD1Tokenizer): - def __init__(self, embedding_directory=None, tokenizer_data={}): - super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer) - -def pixart_te(dtype_t5=None, t5xxl_scaled_fp8=None): - class PixArtTEModel_(PixArtT5XXL): - def __init__(self, device="cpu", dtype=None, model_options={}): - if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options: - model_options = model_options.copy() - model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 - if dtype is None: - dtype = dtype_t5 - super().__init__(device=device, dtype=dtype, model_options=model_options) - return PixArtTEModel_ +import os + +from comfy import sd1_clip +import comfy.text_encoders.t5 +import comfy.text_encoders.sd3_clip +from comfy.sd1_clip import gen_empty_tokens + +from transformers import T5TokenizerFast + +class T5XXLModel(comfy.text_encoders.sd3_clip.T5XXLModel): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def gen_empty_tokens(self, special_tokens, *args, **kwargs): + # PixArt expects the negative to be all pad tokens + special_tokens = special_tokens.copy() + special_tokens.pop("end") + return gen_empty_tokens(special_tokens, *args, **kwargs) + +class PixArtT5XXL(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="t5xxl", clip_model=T5XXLModel, model_options=model_options) + +class T5XXLTokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer") + super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data) # no padding + +class PixArtTokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer) + +def pixart_te(dtype_t5=None, t5xxl_scaled_fp8=None): + class PixArtTEModel_(PixArtT5XXL): + def __init__(self, device="cpu", dtype=None, model_options={}): + if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options: + model_options = model_options.copy() + model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 + if dtype is None: + dtype = dtype_t5 + super().__init__(device=device, dtype=dtype, model_options=model_options) + return PixArtTEModel_ From 7bc7dd2aa2030a87135a3c8d20f636d52627ca9d Mon Sep 17 00:00:00 2001 From: JettHu <35261585+JettHu@users.noreply.github.com> Date: Sat, 12 Jul 2025 00:51:06 +0800 Subject: [PATCH 020/759] Execute async node earlier (#8865) --- comfy_execution/graph.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index c79243e1e..60e2ab91e 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -3,6 +3,7 @@ from typing import Type, Literal import nodes import asyncio +import inspect from comfy_execution.graph_utils import is_link from comfy.comfy_types.node_typing import ComfyNodeABC, InputTypeDict, InputTypeOptions @@ -239,8 +240,15 @@ class ExecutionList(TopologicalSort): return True return False + # If an available node is async, do that first. + # This will execute the asynchronous function earlier, reducing the overall time. + def is_async(node_id): + class_type = self.dynprompt.get_node(node_id)["class_type"] + class_def = nodes.NODE_CLASS_MAPPINGS[class_type] + return inspect.iscoroutinefunction(getattr(class_def, class_def.FUNCTION)) + for node_id in node_list: - if is_output(node_id): + if is_output(node_id) or is_async(node_id): return node_id #This should handle the VAEDecode -> preview case From b43916a134d503bd5cab15dfa281f429527e448b Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Sat, 12 Jul 2025 00:52:58 +0800 Subject: [PATCH 021/759] Fix fresca's input and output (#8871) --- comfy_extras/nodes_fresca.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/comfy_extras/nodes_fresca.py b/comfy_extras/nodes_fresca.py index ee310c874..65c2d0d0e 100644 --- a/comfy_extras/nodes_fresca.py +++ b/comfy_extras/nodes_fresca.py @@ -71,8 +71,11 @@ class FreSca: DESCRIPTION = "Applies frequency-dependent scaling to the guidance" def patch(self, model, scale_low, scale_high, freq_cutoff): def custom_cfg_function(args): - cond = args["conds_out"][0] - uncond = args["conds_out"][1] + conds_out = args["conds_out"] + if len(conds_out) <= 1 or None in args["conds"][:2]: + return conds_out + cond = conds_out[0] + uncond = conds_out[1] guidance = cond - uncond filtered_guidance = Fourier_filter( @@ -83,7 +86,7 @@ class FreSca: ) filtered_cond = filtered_guidance + uncond - return [filtered_cond, uncond] + return [filtered_cond, uncond] + conds_out[2:] m = model.clone() m.set_model_sampler_pre_cfg_function(custom_cfg_function) From b40143984c1bed9cd1bc73a373a80a90c625eb0f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 12 Jul 2025 00:49:26 -0700 Subject: [PATCH 022/759] Add model detection error hint for lora. (#8880) --- comfy/sd.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index 5b95cf75a..8081b167c 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -18,6 +18,7 @@ import comfy.ldm.hunyuan3d.vae import comfy.ldm.ace.vae.music_dcae_pipeline import yaml import math +import os import comfy.utils @@ -977,6 +978,12 @@ def load_gligen(ckpt_path): model = model.half() return comfy.model_patcher.ModelPatcher(model, load_device=model_management.get_torch_device(), offload_device=model_management.unet_offload_device()) +def model_detection_error_hint(path, state_dict): + filename = os.path.basename(path) + if 'lora' in filename.lower(): + return "\nHINT: This seems to be a Lora file and Lora files should be put in the lora folder and loaded with a lora loader node.." + return "" + def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_clip=True, embedding_directory=None, state_dict=None, config=None): logging.warning("Warning: The load checkpoint with config function is deprecated and will eventually be removed, please use the other one.") model, clip, vae, _ = load_checkpoint_guess_config(ckpt_path, output_vae=output_vae, output_clip=output_clip, output_clipvision=False, embedding_directory=embedding_directory, output_model=True) @@ -1005,7 +1012,7 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True) out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata) if out is None: - raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path)) + raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(ckpt_path, model_detection_error_hint(ckpt_path, sd))) return out def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}, metadata=None): @@ -1177,7 +1184,7 @@ def load_diffusion_model(unet_path, model_options={}): model = load_diffusion_model_state_dict(sd, model_options=model_options) if model is None: logging.error("ERROR UNSUPPORTED DIFFUSION MODEL {}".format(unet_path)) - raise RuntimeError("ERROR: Could not detect model type of: {}".format(unet_path)) + raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(unet_path, model_detection_error_hint(unet_path, sd))) return model def load_unet(unet_path, dtype=None): From 480375f3495e9e1437faf47eb2a11222c9acf3f0 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Sun, 13 Jul 2025 01:46:27 -0700 Subject: [PATCH 023/759] Remove auth tokens from history storage (#8889) Remove auth_token_comfy_org and api_key_comfy_org from extra_data before storing prompt history to prevent sensitive authentication tokens from being persisted in the history endpoint response. --- execution.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/execution.py b/execution.py index 90cefc023..bd638afba 100644 --- a/execution.py +++ b/execution.py @@ -1045,6 +1045,12 @@ class PromptQueue: if status is not None: status_dict = copy.deepcopy(status._asdict()) + # Remove auth tokens from extra_data before storing in history + if "auth_token_comfy_org" in prompt[3]: + del prompt[3]["auth_token_comfy_org"] + if "api_key_comfy_org" in prompt[3]: + del prompt[3]["api_key_comfy_org"] + self.history[prompt[1]] = { "prompt": prompt, "outputs": {}, From 4831e9c2c47b97f85fd771521f247a017d1f43e1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 13 Jul 2025 01:59:17 -0700 Subject: [PATCH 024/759] Refactor previous pr. (#8893) --- execution.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/execution.py b/execution.py index bd638afba..c3a62f1cb 100644 --- a/execution.py +++ b/execution.py @@ -123,6 +123,8 @@ class CacheSet: } return result +SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org") + def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, extra_data={}): valid_inputs = class_def.INPUT_TYPES() input_data_all = {} @@ -1045,11 +1047,10 @@ class PromptQueue: if status is not None: status_dict = copy.deepcopy(status._asdict()) - # Remove auth tokens from extra_data before storing in history - if "auth_token_comfy_org" in prompt[3]: - del prompt[3]["auth_token_comfy_org"] - if "api_key_comfy_org" in prompt[3]: - del prompt[3]["api_key_comfy_org"] + # Remove sensitive data from extra_data before storing in history + for sensitive_val in SENSITIVE_EXTRA_DATA_KEYS: + if sensitive_val in prompt[3]: + prompt[3].pop(sensitive_val) self.history[prompt[1]] = { "prompt": prompt, From 9ca581c9416d799db0d7c55ac957a5fa486798c8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 14 Jul 2025 10:10:20 -0700 Subject: [PATCH 025/759] Remove windows line endings. (#8902) --- comfy_extras/nodes_pixart.py | 48 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/comfy_extras/nodes_pixart.py b/comfy_extras/nodes_pixart.py index c7209c468..8d9276afe 100644 --- a/comfy_extras/nodes_pixart.py +++ b/comfy_extras/nodes_pixart.py @@ -1,24 +1,24 @@ -from nodes import MAX_RESOLUTION - -class CLIPTextEncodePixArtAlpha: - @classmethod - def INPUT_TYPES(s): - return {"required": { - "width": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - "height": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - # "aspect_ratio": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "text": ("STRING", {"multiline": True, "dynamicPrompts": True}), "clip": ("CLIP", ), - }} - - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" - CATEGORY = "advanced/conditioning" - DESCRIPTION = "Encodes text and sets the resolution conditioning for PixArt Alpha. Does not apply to PixArt Sigma." - - def encode(self, clip, width, height, text): - tokens = clip.tokenize(text) - return (clip.encode_from_tokens_scheduled(tokens, add_dict={"width": width, "height": height}),) - -NODE_CLASS_MAPPINGS = { - "CLIPTextEncodePixArtAlpha": CLIPTextEncodePixArtAlpha, -} +from nodes import MAX_RESOLUTION + +class CLIPTextEncodePixArtAlpha: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "width": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), + "height": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), + # "aspect_ratio": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), + "text": ("STRING", {"multiline": True, "dynamicPrompts": True}), "clip": ("CLIP", ), + }} + + RETURN_TYPES = ("CONDITIONING",) + FUNCTION = "encode" + CATEGORY = "advanced/conditioning" + DESCRIPTION = "Encodes text and sets the resolution conditioning for PixArt Alpha. Does not apply to PixArt Sigma." + + def encode(self, clip, width, height, text): + tokens = clip.tokenize(text) + return (clip.encode_from_tokens_scheduled(tokens, add_dict={"width": width, "height": height}),) + +NODE_CLASS_MAPPINGS = { + "CLIPTextEncodePixArtAlpha": CLIPTextEncodePixArtAlpha, +} From 861c3bbb3d2330dc7dff7567ffcf07946ace23b8 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 15 Jul 2025 01:27:57 +0800 Subject: [PATCH 026/759] Upate template to 0.1.36 (#8904) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 19a40ca0e..7705918a8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.35 +comfyui-workflow-templates==0.1.36 comfyui-embedded-docs==0.2.4 torch torchsde From 260a5ca5d9997bfa1cec5a4922cb066187e6daf0 Mon Sep 17 00:00:00 2001 From: FeepingCreature <540727+FeepingCreature@users.noreply.github.com> Date: Mon, 14 Jul 2025 20:48:31 +0200 Subject: [PATCH 027/759] Allow the prompt request to specify the prompt ID. (#8189) This makes it easier to write asynchronous clients that submit requests, because they can store the task immediately. Duplicate prompt IDs are rejected by the job queue. --- script_examples/websockets_api_example.py | 11 ++++++----- server.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/script_examples/websockets_api_example.py b/script_examples/websockets_api_example.py index d696d2bba..58f26cfb6 100644 --- a/script_examples/websockets_api_example.py +++ b/script_examples/websockets_api_example.py @@ -10,11 +10,11 @@ import urllib.parse server_address = "127.0.0.1:8188" client_id = str(uuid.uuid4()) -def queue_prompt(prompt): - p = {"prompt": prompt, "client_id": client_id} +def queue_prompt(prompt, prompt_id): + p = {"prompt": prompt, "client_id": client_id, "prompt_id": prompt_id} data = json.dumps(p).encode('utf-8') - req = urllib.request.Request("http://{}/prompt".format(server_address), data=data) - return json.loads(urllib.request.urlopen(req).read()) + req = urllib.request.Request("http://{}/prompt".format(server_address), data=data) + urllib.request.urlopen(req).read() def get_image(filename, subfolder, folder_type): data = {"filename": filename, "subfolder": subfolder, "type": folder_type} @@ -27,7 +27,8 @@ def get_history(prompt_id): return json.loads(response.read()) def get_images(ws, prompt): - prompt_id = queue_prompt(prompt)['prompt_id'] + prompt_id = str(uuid.uuid4()) + queue_prompt(prompt, prompt_id) output_images = {} while True: out = ws.recv() diff --git a/server.py b/server.py index e8bad9f4e..71a58f0fa 100644 --- a/server.py +++ b/server.py @@ -678,7 +678,7 @@ class PromptServer(): if "prompt" in json_data: prompt = json_data["prompt"] - prompt_id = str(uuid.uuid4()) + prompt_id = str(json_data.get("prompt_id", uuid.uuid4())) valid = await execution.validate_prompt(prompt_id, prompt) extra_data = {} if "extra_data" in json_data: From 543c24108c565a7dc21920d2c509b7596020eac1 Mon Sep 17 00:00:00 2001 From: Yoland Yan <4950057+yoland68@users.noreply.github.com> Date: Mon, 14 Jul 2025 17:45:55 -0700 Subject: [PATCH 028/759] Fix wrong reference bug (#8910) --- comfy/supported_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 2669ca01e..2ca3857f7 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1214,7 +1214,7 @@ class Omnigen2(supported_models_base.BASE): def clip_target(self, state_dict={}): pref = self.text_encoder_key_prefix[0] hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref)) - return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.LuminaTokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect)) + return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect)) models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2] From 9dabda19f046796f1169b593188b9b1b9263705e Mon Sep 17 00:00:00 2001 From: Yoland Yan <4950057+yoland68@users.noreply.github.com> Date: Mon, 14 Jul 2025 17:59:35 -0700 Subject: [PATCH 029/759] Update nodes_gemini.py (#8912) --- comfy_api_nodes/nodes_gemini.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index ae7b04846..5935ab2bb 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -406,7 +406,7 @@ class GeminiInputFiles(ComfyNodeABC): def create_file_part(self, file_path: str) -> GeminiPart: mime_type = ( - GeminiMimeType.pdf + GeminiMimeType.application_pdf if file_path.endswith(".pdf") else GeminiMimeType.text_plain ) From b1ae4126c39dd2a2831d4b88f91c7d18bc307ef7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 14 Jul 2025 23:27:18 -0700 Subject: [PATCH 030/759] Add action to detect windows line endings. (#8917) --- .github/workflows/check-line-endings.yml | 40 ++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/check-line-endings.yml diff --git a/.github/workflows/check-line-endings.yml b/.github/workflows/check-line-endings.yml new file mode 100644 index 000000000..f20dca565 --- /dev/null +++ b/.github/workflows/check-line-endings.yml @@ -0,0 +1,40 @@ +name: Check for Windows Line Endings + +on: + pull_request: + branches: ['*'] # Trigger on all pull requests to any branch + +jobs: + check-line-endings: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch all history to compare changes + + - name: Check for Windows line endings (CRLF) + run: | + # Get the list of changed files in the PR + CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }}..HEAD) + + # Flag to track if CRLF is found + CRLF_FOUND=false + + # Loop through each changed file + for FILE in $CHANGED_FILES; do + # Check if the file exists and is a text file + if [ -f "$FILE" ] && file "$FILE" | grep -q "text"; then + # Check for CRLF line endings + if grep -UP '\r$' "$FILE"; then + echo "Error: Windows line endings (CRLF) detected in $FILE" + CRLF_FOUND=true + fi + fi + done + + # Exit with error if CRLF was found + if [ "$CRLF_FOUND" = true ]; then + exit 1 + fi From 6b8062f4141f6655c4cea5422c29aa0a8b532cdf Mon Sep 17 00:00:00 2001 From: Brandon Wallace Date: Tue, 15 Jul 2025 20:08:27 -0500 Subject: [PATCH 031/759] Fix MaskComposite error when destination has 2 dimensions (#8915) Fix code that is using the original `destination` input instead of the reshaped value. --- comfy_extras/nodes_mask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index ab387a2fc..2b0f8dd5d 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -247,7 +247,7 @@ class MaskComposite: visible_width, visible_height = (right - left, bottom - top,) source_portion = source[:, :visible_height, :visible_width] - destination_portion = destination[:, top:bottom, left:right] + destination_portion = output[:, top:bottom, left:right] if operation == "multiply": output[:, top:bottom, left:right] = destination_portion * source_portion From 50afba747cd3413a6f6eb6703c627a24d2e6f165 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 16 Jul 2025 00:42:17 -0700 Subject: [PATCH 032/759] Add attempt to work around the safetensors mmap issue. (#8928) --- comfy/cli_args.py | 1 + comfy/utils.py | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 7234a7ba0..ef0d4337e 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -144,6 +144,7 @@ class PerformanceFeature(enum.Enum): parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops") parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.") +parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.") parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.") parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.") diff --git a/comfy/utils.py b/comfy/utils.py index f8e01f713..9c076a0e0 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -31,6 +31,7 @@ from einops import rearrange from comfy.cli_args import args MMAP_TORCH_FILES = args.mmap_torch_files +DISABLE_MMAP = args.disable_mmap ALWAYS_SAFE_LOAD = False if hasattr(torch.serialization, "add_safe_globals"): # TODO: this was added in pytorch 2.4, the unsafe path should be removed once earlier versions are deprecated @@ -58,7 +59,10 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False): with safetensors.safe_open(ckpt, framework="pt", device=device.type) as f: sd = {} for k in f.keys(): - sd[k] = f.get_tensor(k) + tensor = f.get_tensor(k) + if DISABLE_MMAP: # TODO: Not sure if this is the best way to bypass the mmap issues + tensor = tensor.to(device=device, copy=True) + sd[k] = tensor if return_metadata: metadata = f.metadata() except Exception as e: From 9bc2798f72947d6b76b3650c88a7b2f0afc1a8d0 Mon Sep 17 00:00:00 2001 From: Harel Cain Date: Wed, 16 Jul 2025 19:54:38 +0200 Subject: [PATCH 033/759] LTXV VAE decoder: switch default padding mode (#8930) --- comfy/ldm/lightricks/vae/causal_video_autoencoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py index f91870d71..75ed069ad 100644 --- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py +++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py @@ -973,7 +973,7 @@ class VideoVAE(nn.Module): norm_layer=config.get("norm_layer", "group_norm"), causal=config.get("causal_decoder", False), timestep_conditioning=self.timestep_conditioning, - spatial_padding_mode=config.get("spatial_padding_mode", "zeros"), + spatial_padding_mode=config.get("spatial_padding_mode", "reflect"), ) self.per_channel_statistics = processor() From 491fafbd6428e59c52093d76fde0bc10dfa723bb Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 16 Jul 2025 11:42:07 -0700 Subject: [PATCH 034/759] Silence clip tokenizer warning. (#8934) --- comfy/sd1_tokenizer/tokenizer_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/sd1_tokenizer/tokenizer_config.json b/comfy/sd1_tokenizer/tokenizer_config.json index 5ba7bf706..8f7b3151d 100644 --- a/comfy/sd1_tokenizer/tokenizer_config.json +++ b/comfy/sd1_tokenizer/tokenizer_config.json @@ -18,7 +18,7 @@ "single_word": false }, "errors": "replace", - "model_max_length": 77, + "model_max_length": 8192, "name_or_path": "openai/clip-vit-large-patch14", "pad_token": "<|endoftext|>", "special_tokens_map_file": "./special_tokens_map.json", From 650838fd6fcb24e66bf82b3c75383f35433d7b8a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 17 Jul 2025 01:11:07 -0700 Subject: [PATCH 035/759] Experimental CFGNorm node. (#8942) This is from the new hidream e1 1 model code. Figured it might be useful as a generic cfg trick. --- comfy_extras/nodes_cfg.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_cfg.py b/comfy_extras/nodes_cfg.py index 1fb686644..1acaf15bc 100644 --- a/comfy_extras/nodes_cfg.py +++ b/comfy_extras/nodes_cfg.py @@ -40,6 +40,32 @@ class CFGZeroStar: m.set_model_sampler_post_cfg_function(cfg_zero_star) return (m, ) +class CFGNorm: + @classmethod + def INPUT_TYPES(s): + return {"required": {"model": ("MODEL",), + "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}), + }} + RETURN_TYPES = ("MODEL",) + RETURN_NAMES = ("patched_model",) + FUNCTION = "patch" + CATEGORY = "advanced/guidance" + + def patch(self, model, strength): + m = model.clone() + def cfg_norm(args): + cond_p = args['cond_denoised'] + pred_text_ = args["denoised"] + + norm_full_cond = torch.norm(cond_p, dim=1, keepdim=True) + norm_pred_text = torch.norm(pred_text_, dim=1, keepdim=True) + scale = (norm_full_cond / (norm_pred_text + 1e-8)).clamp(min=0.0, max=1.0) + return pred_text_ * scale * strength + + m.set_model_sampler_post_cfg_function(cfg_norm) + return (m, ) + NODE_CLASS_MAPPINGS = { - "CFGZeroStar": CFGZeroStar + "CFGZeroStar": CFGZeroStar, + "CFGNorm": CFGNorm, } From 7f492522b6dcb142ff2c4d3438310773d9a80551 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 18 Jul 2025 02:43:02 -0700 Subject: [PATCH 036/759] Forgot this (#8957) --- comfy_extras/nodes_cfg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy_extras/nodes_cfg.py b/comfy_extras/nodes_cfg.py index 1acaf15bc..5abdc115a 100644 --- a/comfy_extras/nodes_cfg.py +++ b/comfy_extras/nodes_cfg.py @@ -50,6 +50,7 @@ class CFGNorm: RETURN_NAMES = ("patched_model",) FUNCTION = "patch" CATEGORY = "advanced/guidance" + EXPERIMENTAL = True def patch(self, model, strength): m = model.clone() From 1b96fae1d4a8425c44d4d3bd60acd818d05bf4f6 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 19 Jul 2025 01:55:23 -0700 Subject: [PATCH 037/759] Add nested style of dual cfg to DualCFGGuider node. (#8965) --- comfy_extras/nodes_custom_sampler.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index 33bc41842..d17737e1a 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -683,9 +683,10 @@ class CFGGuider: return (guider,) class Guider_DualCFG(comfy.samplers.CFGGuider): - def set_cfg(self, cfg1, cfg2): + def set_cfg(self, cfg1, cfg2, nested=False): self.cfg1 = cfg1 self.cfg2 = cfg2 + self.nested = nested def set_conds(self, positive, middle, negative): middle = node_helpers.conditioning_set_values(middle, {"prompt_type": "negative"}) @@ -695,14 +696,20 @@ class Guider_DualCFG(comfy.samplers.CFGGuider): negative_cond = self.conds.get("negative", None) middle_cond = self.conds.get("middle", None) positive_cond = self.conds.get("positive", None) - if model_options.get("disable_cfg1_optimization", False) == False: - if math.isclose(self.cfg2, 1.0): - negative_cond = None - if math.isclose(self.cfg1, 1.0): - middle_cond = None - out = comfy.samplers.calc_cond_batch(self.inner_model, [negative_cond, middle_cond, positive_cond], x, timestep, model_options) - return comfy.samplers.cfg_function(self.inner_model, out[1], out[0], self.cfg2, x, timestep, model_options=model_options, cond=middle_cond, uncond=negative_cond) + (out[2] - out[1]) * self.cfg1 + if self.nested: + out = comfy.samplers.calc_cond_batch(self.inner_model, [negative_cond, middle_cond, positive_cond], x, timestep, model_options) + pred_text = comfy.samplers.cfg_function(self.inner_model, out[2], out[1], self.cfg1, x, timestep, model_options=model_options, cond=positive_cond, uncond=middle_cond) + return out[0] + self.cfg2 * (pred_text - out[0]) + else: + if model_options.get("disable_cfg1_optimization", False) == False: + if math.isclose(self.cfg2, 1.0): + negative_cond = None + if math.isclose(self.cfg1, 1.0): + middle_cond = None + + out = comfy.samplers.calc_cond_batch(self.inner_model, [negative_cond, middle_cond, positive_cond], x, timestep, model_options) + return comfy.samplers.cfg_function(self.inner_model, out[1], out[0], self.cfg2, x, timestep, model_options=model_options, cond=middle_cond, uncond=negative_cond) + (out[2] - out[1]) * self.cfg1 class DualCFGGuider: @classmethod @@ -714,6 +721,7 @@ class DualCFGGuider: "negative": ("CONDITIONING", ), "cfg_conds": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), "cfg_cond2_negative": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), + "style": (["regular", "nested"],), } } @@ -722,10 +730,10 @@ class DualCFGGuider: FUNCTION = "get_guider" CATEGORY = "sampling/custom_sampling/guiders" - def get_guider(self, model, cond1, cond2, negative, cfg_conds, cfg_cond2_negative): + def get_guider(self, model, cond1, cond2, negative, cfg_conds, cfg_cond2_negative, style): guider = Guider_DualCFG(model) guider.set_conds(cond1, cond2, negative) - guider.set_cfg(cfg_conds, cfg_cond2_negative) + guider.set_cfg(cfg_conds, cfg_cond2_negative, nested=(style == "nested")) return (guider,) class DisableNoise: From 1da5639e865a50f921d870a92c7c87110ce20c48 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sat, 19 Jul 2025 18:08:00 +0800 Subject: [PATCH 038/759] Update template to 0.1.37 (#8967) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7705918a8..a7e44095f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.36 +comfyui-workflow-templates==0.1.37 comfyui-embedded-docs==0.2.4 torch torchsde From 100c2478eaba71ab735539fdc00c9d0de49bc224 Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Sun, 20 Jul 2025 11:09:11 +0800 Subject: [PATCH 039/759] Add SamplingPercentToSigma node (#8963) It's helpful to adjust start_percent or end_percent based on the corresponding sigma. --- comfy_extras/nodes_custom_sampler.py | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index d17737e1a..d011f433b 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -301,6 +301,35 @@ class ExtendIntermediateSigmas: return (extended_sigmas,) + +class SamplingPercentToSigma: + @classmethod + def INPUT_TYPES(cls) -> InputTypeDict: + return { + "required": { + "model": (IO.MODEL, {}), + "sampling_percent": (IO.FLOAT, {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.0001}), + "return_actual_sigma": (IO.BOOLEAN, {"default": False, "tooltip": "Return the actual sigma value instead of the value used for interval checks.\nThis only affects results at 0.0 and 1.0."}), + } + } + + RETURN_TYPES = (IO.FLOAT,) + RETURN_NAMES = ("sigma_value",) + CATEGORY = "sampling/custom_sampling/sigmas" + + FUNCTION = "get_sigma" + + def get_sigma(self, model, sampling_percent, return_actual_sigma): + model_sampling = model.get_model_object("model_sampling") + sigma_val = model_sampling.percent_to_sigma(sampling_percent) + if return_actual_sigma: + if sampling_percent == 0.0: + sigma_val = model_sampling.sigma_max.item() + elif sampling_percent == 1.0: + sigma_val = model_sampling.sigma_min.item() + return (sigma_val,) + + class KSamplerSelect: @classmethod def INPUT_TYPES(s): @@ -887,6 +916,7 @@ NODE_CLASS_MAPPINGS = { "FlipSigmas": FlipSigmas, "SetFirstSigma": SetFirstSigma, "ExtendIntermediateSigmas": ExtendIntermediateSigmas, + "SamplingPercentToSigma": SamplingPercentToSigma, "CFGGuider": CFGGuider, "DualCFGGuider": DualCFGGuider, From a0c0785635a9f4d2da64b58fef063825f386d8da Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 19 Jul 2025 22:24:09 -0700 Subject: [PATCH 040/759] Document what the fast_fp16_accumulation is in the portable. (#8973) --- .ci/windows_base_files/README_VERY_IMPORTANT.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.ci/windows_base_files/README_VERY_IMPORTANT.txt b/.ci/windows_base_files/README_VERY_IMPORTANT.txt index d46acbcbf..8ab70c890 100755 --- a/.ci/windows_base_files/README_VERY_IMPORTANT.txt +++ b/.ci/windows_base_files/README_VERY_IMPORTANT.txt @@ -4,6 +4,9 @@ if you have a NVIDIA gpu: run_nvidia_gpu.bat +if you want to enable the fast fp16 accumulation (faster for fp16 models with slightly less quality): + +run_nvidia_gpu_fast_fp16_accumulation.bat To run it in slow CPU mode: From 7d627f764c2137d816a39adbc358cb28c1718a47 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Mon, 21 Jul 2025 03:58:35 +0800 Subject: [PATCH 041/759] Update template to 0.1.39 (#8981) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a7e44095f..8f6a6d112 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.37 +comfyui-workflow-templates==0.1.39 comfyui-embedded-docs==0.2.4 torch torchsde From 9a470e073e2742d4edd6e7ea1ce28d861a77d9c4 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 21 Jul 2025 14:05:43 -0400 Subject: [PATCH 042/759] ComfyUI version 0.3.45 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 7981fbaca..180ecaf8a 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.44" +__version__ = "0.3.45" diff --git a/pyproject.toml b/pyproject.toml index 96ead2157..b1d6d9df6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.44" +version = "0.3.45" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 54a45b996729b361ea12f473de760e481dcf1f0a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 21 Jul 2025 11:19:14 -0700 Subject: [PATCH 043/759] Replace torchaudio.load with pyav. (#8989) --- comfy_extras/nodes_audio.py | 58 ++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 8cd647846..38697240e 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -278,6 +278,62 @@ class PreviewAudio(SaveAudio): "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, } +def f32_pcm(wav: torch.Tensor) -> torch.Tensor: + """Convert audio to float 32 bits PCM format.""" + if wav.dtype.is_floating_point: + return wav + elif wav.dtype == torch.int16: + return wav.float() / (2 ** 15) + elif wav.dtype == torch.int32: + return wav.float() / (2 ** 31) + raise ValueError(f"Unsupported wav dtype: {wav.dtype}") + +def load(filepath: str, frame_offset: int = 0, num_frames: int = -1) -> tuple[torch.Tensor, int]: + with av.open(filepath) as af: + if not af.streams.audio: + raise ValueError("No audio stream found in the file.") + + stream = af.streams.audio[0] + sr = stream.codec_context.sample_rate + n_channels = stream.channels + + seek_time = frame_offset / sr if frame_offset > 0 else 0.0 + duration = num_frames / sr if num_frames > 0 else -1.0 + + sample_offset = int(sr * seek_time) + num_samples = int(sr * duration) if duration >= 0 else -1 + + # Small negative offset for MP3 artifacts, NOTE: this is LLM code so idk if it's actually necessary' + seek_sec = max(0, seek_time - 0.1) if filepath.lower().endswith('.mp3') else seek_time + af.seek(int(seek_sec / stream.time_base), stream=stream) + + frames = [] + length = 0 + for frame in af.decode(streams=stream.index): + current_offset = int(frame.rate * frame.pts * frame.time_base) + strip = max(0, sample_offset - current_offset) + + buf = torch.from_numpy(frame.to_ndarray()) + if buf.shape[0] != n_channels: + buf = buf.view(-1, n_channels).t() + + buf = buf[:, strip:] + frames.append(buf) + length += buf.shape[1] + + if num_samples > 0 and length >= num_samples: + break + + if not frames: + raise ValueError("No audio frames decoded.") + + wav = torch.cat(frames, dim=1) + if num_samples > 0: + wav = wav[:, :num_samples] + + wav = f32_pcm(wav) + return wav, sr + class LoadAudio: @classmethod def INPUT_TYPES(s): @@ -292,7 +348,7 @@ class LoadAudio: def load(self, audio): audio_path = folder_paths.get_annotated_filepath(audio) - waveform, sample_rate = torchaudio.load(audio_path) + waveform, sample_rate = load(audio_path) audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate} return (audio, ) From 5249e45a1c7d91656ebefdebe3815005ec3d39d7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 21 Jul 2025 12:23:41 -0700 Subject: [PATCH 044/759] Add hidream e1.1 example to readme. (#8990) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0e021a687..d004364ee 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - Image Editing Models - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model) + - [HiDream E1.1](https://comfyanonymous.github.io/ComfyUI_examples/hidream/#hidream-e11) - Video Models - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/) - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/) From 0aa1c58b04b27311c6ba38b1d9949e7e20037d00 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 21 Jul 2025 13:48:25 -0700 Subject: [PATCH 045/759] This is not needed. (#8991) --- comfy_extras/nodes_audio.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 38697240e..a90b31779 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -288,7 +288,7 @@ def f32_pcm(wav: torch.Tensor) -> torch.Tensor: return wav.float() / (2 ** 31) raise ValueError(f"Unsupported wav dtype: {wav.dtype}") -def load(filepath: str, frame_offset: int = 0, num_frames: int = -1) -> tuple[torch.Tensor, int]: +def load(filepath: str) -> tuple[torch.Tensor, int]: with av.open(filepath) as af: if not af.streams.audio: raise ValueError("No audio stream found in the file.") @@ -297,40 +297,20 @@ def load(filepath: str, frame_offset: int = 0, num_frames: int = -1) -> tuple[to sr = stream.codec_context.sample_rate n_channels = stream.channels - seek_time = frame_offset / sr if frame_offset > 0 else 0.0 - duration = num_frames / sr if num_frames > 0 else -1.0 - - sample_offset = int(sr * seek_time) - num_samples = int(sr * duration) if duration >= 0 else -1 - - # Small negative offset for MP3 artifacts, NOTE: this is LLM code so idk if it's actually necessary' - seek_sec = max(0, seek_time - 0.1) if filepath.lower().endswith('.mp3') else seek_time - af.seek(int(seek_sec / stream.time_base), stream=stream) - frames = [] length = 0 for frame in af.decode(streams=stream.index): - current_offset = int(frame.rate * frame.pts * frame.time_base) - strip = max(0, sample_offset - current_offset) - buf = torch.from_numpy(frame.to_ndarray()) if buf.shape[0] != n_channels: buf = buf.view(-1, n_channels).t() - buf = buf[:, strip:] frames.append(buf) length += buf.shape[1] - if num_samples > 0 and length >= num_samples: - break - if not frames: raise ValueError("No audio frames decoded.") wav = torch.cat(frames, dim=1) - if num_samples > 0: - wav = wav[:, :num_samples] - wav = f32_pcm(wav) return wav, sr From 5ac9ec214ba3ef1632701416f27948a57ec60919 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 22 Jul 2025 01:07:51 -0700 Subject: [PATCH 046/759] Try to fix line endings workflow. (#9001) --- .github/workflows/check-line-endings.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/check-line-endings.yml b/.github/workflows/check-line-endings.yml index f20dca565..03b3e3ced 100644 --- a/.github/workflows/check-line-endings.yml +++ b/.github/workflows/check-line-endings.yml @@ -17,6 +17,7 @@ jobs: - name: Check for Windows line endings (CRLF) run: | # Get the list of changed files in the PR + git merge origin/${{ github.base_ref }} --no-edit CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }}..HEAD) # Flag to track if CRLF is found From 255f1398638b265a47d0e74fb4759fe6cfc3b3d4 Mon Sep 17 00:00:00 2001 From: Simon Lui <502929+simonlui@users.noreply.github.com> Date: Tue, 22 Jul 2025 12:20:09 -0700 Subject: [PATCH 047/759] Add xpu version for async offload and some other things. (#9004) --- comfy/model_management.py | 41 +++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 816caf18f..ab1e9bf3a 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -101,7 +101,7 @@ if args.directml is not None: lowvram_available = False #TODO: need to find a way to get free memory in directml before this can be enabled by default. try: - import intel_extension_for_pytorch as ipex + import intel_extension_for_pytorch as ipex # noqa: F401 _ = torch.xpu.device_count() xpu_available = xpu_available or torch.xpu.is_available() except: @@ -186,8 +186,12 @@ def get_total_memory(dev=None, torch_total_too=False): elif is_intel_xpu(): stats = torch.xpu.memory_stats(dev) mem_reserved = stats['reserved_bytes.all.current'] + if torch_version_numeric < (2, 6): + mem_total_xpu = torch.xpu.get_device_properties(dev).total_memory + else: + _, mem_total_xpu = torch.xpu.mem_get_info(dev) mem_total_torch = mem_reserved - mem_total = torch.xpu.get_device_properties(dev).total_memory + mem_total = mem_total_xpu elif is_ascend_npu(): stats = torch.npu.memory_stats(dev) mem_reserved = stats['reserved_bytes.all.current'] @@ -929,7 +933,7 @@ def device_supports_non_blocking(device): if is_device_mps(device): return False #pytorch bug? mps doesn't support non blocking if is_intel_xpu(): - return False + return True if args.deterministic: #TODO: figure out why deterministic breaks non blocking from gpu to cpu (previews) return False if directml_enabled: @@ -968,6 +972,8 @@ def get_offload_stream(device): stream_counter = (stream_counter + 1) % len(ss) if is_device_cuda(device): ss[stream_counter].wait_stream(torch.cuda.current_stream()) + elif is_device_xpu(device): + ss[stream_counter].wait_stream(torch.xpu.current_stream()) stream_counters[device] = stream_counter return s elif is_device_cuda(device): @@ -979,6 +985,15 @@ def get_offload_stream(device): stream_counter = (stream_counter + 1) % len(ss) stream_counters[device] = stream_counter return s + elif is_device_xpu(device): + ss = [] + for k in range(NUM_STREAMS): + ss.append(torch.xpu.Stream(device=device, priority=0)) + STREAMS[device] = ss + s = ss[stream_counter] + stream_counter = (stream_counter + 1) % len(ss) + stream_counters[device] = stream_counter + return s return None def sync_stream(device, stream): @@ -986,6 +1001,8 @@ def sync_stream(device, stream): return if is_device_cuda(device): torch.cuda.current_stream().wait_stream(stream) + elif is_device_xpu(device): + torch.xpu.current_stream().wait_stream(stream) def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None): if device is None or weight.device == device: @@ -1092,8 +1109,11 @@ def get_free_memory(dev=None, torch_free_too=False): stats = torch.xpu.memory_stats(dev) mem_active = stats['active_bytes.all.current'] mem_reserved = stats['reserved_bytes.all.current'] + if torch_version_numeric < (2, 6): + mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved + else: + mem_free_xpu, _ = torch.xpu.mem_get_info(dev) mem_free_torch = mem_reserved - mem_active - mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved mem_free_total = mem_free_xpu + mem_free_torch elif is_ascend_npu(): stats = torch.npu.memory_stats(dev) @@ -1142,6 +1162,9 @@ def is_device_cpu(device): def is_device_mps(device): return is_device_type(device, 'mps') +def is_device_xpu(device): + return is_device_type(device, 'xpu') + def is_device_cuda(device): return is_device_type(device, 'cuda') @@ -1173,7 +1196,10 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma return False if is_intel_xpu(): - return True + if torch_version_numeric < (2, 3): + return True + else: + return torch.xpu.get_device_properties(device).has_fp16 if is_ascend_npu(): return True @@ -1236,7 +1262,10 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma return False if is_intel_xpu(): - return True + if torch_version_numeric < (2, 6): + return True + else: + return torch.xpu.get_device_capability(device)['has_bfloat16_conversions'] if is_ascend_npu(): return True From 5ad33787dee43d36f8d054c590818b3153b55370 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 11:20:49 -0700 Subject: [PATCH 048/759] Add default device argument. (#9023) --- comfy/cli_args.py | 3 ++- comfy/model_management.py | 1 + main.py | 9 +++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index ef0d4337e..0d760d524 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -49,7 +49,8 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.") parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.") parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.") -parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.") +parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use. All other devices will not be visible.") +parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.") cm_group = parser.add_mutually_exclusive_group() cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).") cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Disable cudaMallocAsync.") diff --git a/comfy/model_management.py b/comfy/model_management.py index ab1e9bf3a..346673895 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -880,6 +880,7 @@ def vae_dtype(device=None, allowed_dtypes=[]): return d # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32 + # slowness still a problem on pytorch nightly 2.9.0.dev20250720+rocm6.4 tested on RDNA3 if d == torch.bfloat16 and (not is_amd()) and should_use_bf16(device): return d diff --git a/main.py b/main.py index 2b4ffafd4..e8ca8152a 100644 --- a/main.py +++ b/main.py @@ -115,6 +115,15 @@ if os.name == "nt": logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage()) if __name__ == "__main__": + if args.default_device is not None: + default_dev = args.default_device + devices = list(range(32)) + devices.remove(default_dev) + devices.insert(0, default_dev) + devices = ','.join(map(str, devices)) + os.environ['CUDA_VISIBLE_DEVICES'] = str(devices) + os.environ['HIP_VISIBLE_DEVICES'] = str(devices) + if args.cuda_device is not None: os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda_device) os.environ['HIP_VISIBLE_DEVICES'] = str(args.cuda_device) From 39dda1d40d1f2f18ccda8ade860932d0b8a07af4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 15:10:59 -0700 Subject: [PATCH 049/759] Fix xpu function not implemented. (#9026) --- comfy/model_management.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 346673895..746b063ed 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -186,10 +186,7 @@ def get_total_memory(dev=None, torch_total_too=False): elif is_intel_xpu(): stats = torch.xpu.memory_stats(dev) mem_reserved = stats['reserved_bytes.all.current'] - if torch_version_numeric < (2, 6): - mem_total_xpu = torch.xpu.get_device_properties(dev).total_memory - else: - _, mem_total_xpu = torch.xpu.mem_get_info(dev) + mem_total_xpu = torch.xpu.get_device_properties(dev).total_memory mem_total_torch = mem_reserved mem_total = mem_total_xpu elif is_ascend_npu(): From a86a58c308c2423e86054462a8c9f1125536a034 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 15:18:20 -0700 Subject: [PATCH 050/759] Fix xpu function not implemented p2. (#9027) --- comfy/model_management.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 746b063ed..42873d09b 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1107,10 +1107,7 @@ def get_free_memory(dev=None, torch_free_too=False): stats = torch.xpu.memory_stats(dev) mem_active = stats['active_bytes.all.current'] mem_reserved = stats['reserved_bytes.all.current'] - if torch_version_numeric < (2, 6): - mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved - else: - mem_free_xpu, _ = torch.xpu.mem_get_info(dev) + mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved mem_free_torch = mem_reserved - mem_active mem_free_total = mem_free_xpu + mem_free_torch elif is_ascend_npu(): From d3504e1778c0cc8992b04fe30dc0fae239c13713 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 16:21:29 -0700 Subject: [PATCH 051/759] Enable pytorch attention by default for gfx1201 on torch 2.8 (#9029) --- comfy/model_management.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 42873d09b..e8b9b5c81 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -308,7 +308,10 @@ try: logging.info("ROCm version: {}".format(rocm_version)) if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much - if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx1201 and gfx950 + if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950 + ENABLE_PYTORCH_ATTENTION = True + if torch_version_numeric >= (2, 8): + if any((a in arch) for a in ["gfx1201"]): ENABLE_PYTORCH_ATTENTION = True if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4): if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]): # TODO: more arches From e78d2304966b6265fa2320b4d87dca534ea15642 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 23 Jul 2025 16:37:43 -0700 Subject: [PATCH 052/759] Only enable cuda malloc on cuda torch. (#9031) --- cuda_malloc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_malloc.py b/cuda_malloc.py index eb2857c5f..c1d9ae3ca 100644 --- a/cuda_malloc.py +++ b/cuda_malloc.py @@ -74,7 +74,8 @@ if not args.cuda_malloc: module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) version = module.__version__ - if int(version[0]) >= 2: #enable by default for torch version 2.0 and up + + if int(version[0]) >= 2 and "+cu" in version: #enable by default for torch version 2.0 and up only on cuda torch args.cuda_malloc = cuda_malloc_supported() except: pass From e729a5cc1157bc0ece7daae9583c3a5a3ba95fbb Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Thu, 24 Jul 2025 07:47:05 +0800 Subject: [PATCH 053/759] Separate denoised and noise estimation in Euler CFG++ (#9008) This will change their behavior with the sampling CONST type. It also combines euler_cfg_pp and euler_ancestral_cfg_pp into one main function. --- comfy/k_diffusion/sampling.py | 64 +++++++++++++++++------------------ 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index 2ed415b1f..a2bc492fd 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -1210,39 +1210,21 @@ def sample_deis(model, x, sigmas, extra_args=None, callback=None, disable=None, return x_next -@torch.no_grad() -def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None): - extra_args = {} if extra_args is None else extra_args - - temp = [0] - def post_cfg_function(args): - temp[0] = args["uncond_denoised"] - return args["denoised"] - - model_options = extra_args.get("model_options", {}).copy() - extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True) - - s_in = x.new_ones([x.shape[0]]) - for i in trange(len(sigmas) - 1, disable=disable): - sigma_hat = sigmas[i] - denoised = model(x, sigma_hat * s_in, **extra_args) - d = to_d(x, sigma_hat, temp[0]) - if callback is not None: - callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised}) - # Euler method - x = denoised + d * sigmas[i + 1] - return x - @torch.no_grad() def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): - """Ancestral sampling with Euler method steps.""" + """Ancestral sampling with Euler method steps (CFG++).""" extra_args = {} if extra_args is None else extra_args seed = extra_args.get("seed", None) noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler - temp = [0] + model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling") + lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling) + + uncond_denoised = None + def post_cfg_function(args): - temp[0] = args["uncond_denoised"] + nonlocal uncond_denoised + uncond_denoised = args["uncond_denoised"] return args["denoised"] model_options = extra_args.get("model_options", {}).copy() @@ -1251,15 +1233,33 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No s_in = x.new_ones([x.shape[0]]) for i in trange(len(sigmas) - 1, disable=disable): denoised = model(x, sigmas[i] * s_in, **extra_args) - sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta) if callback is not None: callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) - d = to_d(x, sigmas[i], temp[0]) - # Euler method - x = denoised + d * sigma_down - if sigmas[i + 1] > 0: - x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up + if sigmas[i + 1] == 0: + # Denoising step + x = denoised + else: + alpha_s = sigmas[i] * lambda_fn(sigmas[i]).exp() + alpha_t = sigmas[i + 1] * lambda_fn(sigmas[i + 1]).exp() + d = to_d(x, sigmas[i], alpha_s * uncond_denoised) # to noise + + # DDIM stochastic sampling + sigma_down, sigma_up = get_ancestral_step(sigmas[i] / alpha_s, sigmas[i + 1] / alpha_t, eta=eta) + sigma_down = alpha_t * sigma_down + + # Euler method + x = alpha_t * denoised + sigma_down * d + if eta > 0 and s_noise > 0: + x = x + alpha_t * noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up return x + + +@torch.no_grad() +def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None): + """Euler method steps (CFG++).""" + return sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=0.0, s_noise=0.0, noise_sampler=None) + + @torch.no_grad() def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): """Ancestral sampling with DPM-Solver++(2S) second-order steps.""" From eb2f78b4e09b1970e2fc51fc5d2e062f1a826399 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Thu, 24 Jul 2025 08:57:27 +0800 Subject: [PATCH 054/759] [Training Node] algo support, grad acc, optional grad ckpt (#9015) * Add factorization utils for lokr * Add lokr train impl * Add loha train impl * Add adapter map for algo selection * Add optional grad ckpt and algo selection * Update __init__.py * correct key name for loha * Use custom fwd/bwd func and better init for loha * Support gradient accumulation * Fix bugs of loha * use more stable init * Add OFT training * linting --- comfy/weight_adapter/__init__.py | 13 ++- comfy/weight_adapter/base.py | 40 +++++++++ comfy/weight_adapter/loha.py | 134 ++++++++++++++++++++++++++++++- comfy/weight_adapter/lokr.py | 86 +++++++++++++++++++- comfy/weight_adapter/oft.py | 67 +++++++++++++++- comfy_extras/nodes_train.py | 47 ++++++++--- 6 files changed, 372 insertions(+), 15 deletions(-) diff --git a/comfy/weight_adapter/__init__.py b/comfy/weight_adapter/__init__.py index 560b82be3..b40f920e4 100644 --- a/comfy/weight_adapter/__init__.py +++ b/comfy/weight_adapter/__init__.py @@ -15,9 +15,20 @@ adapters: list[type[WeightAdapterBase]] = [ OFTAdapter, BOFTAdapter, ] +adapter_maps: dict[str, type[WeightAdapterBase]] = { + "LoRA": LoRAAdapter, + "LoHa": LoHaAdapter, + "LoKr": LoKrAdapter, + "OFT": OFTAdapter, + ## We disable not implemented algo for now + # "GLoRA": GLoRAAdapter, + # "BOFT": BOFTAdapter, +} + __all__ = [ "WeightAdapterBase", "WeightAdapterTrainBase", - "adapters" + "adapters", + "adapter_maps", ] + [a.__name__ for a in adapters] diff --git a/comfy/weight_adapter/base.py b/comfy/weight_adapter/base.py index b5c7db423..43644b106 100644 --- a/comfy/weight_adapter/base.py +++ b/comfy/weight_adapter/base.py @@ -133,3 +133,43 @@ def tucker_weight_from_conv(up, down, mid): def tucker_weight(wa, wb, t): temp = torch.einsum("i j ..., j r -> i r ...", t, wb) return torch.einsum("i j ..., i r -> r j ...", temp, wa) + + +def factorization(dimension: int, factor: int = -1) -> tuple[int, int]: + """ + return a tuple of two value of input dimension decomposed by the number closest to factor + second value is higher or equal than first value. + + examples) + factor + -1 2 4 8 16 ... + 127 -> 1, 127 127 -> 1, 127 127 -> 1, 127 127 -> 1, 127 127 -> 1, 127 + 128 -> 8, 16 128 -> 2, 64 128 -> 4, 32 128 -> 8, 16 128 -> 8, 16 + 250 -> 10, 25 250 -> 2, 125 250 -> 2, 125 250 -> 5, 50 250 -> 10, 25 + 360 -> 8, 45 360 -> 2, 180 360 -> 4, 90 360 -> 8, 45 360 -> 12, 30 + 512 -> 16, 32 512 -> 2, 256 512 -> 4, 128 512 -> 8, 64 512 -> 16, 32 + 1024 -> 32, 32 1024 -> 2, 512 1024 -> 4, 256 1024 -> 8, 128 1024 -> 16, 64 + """ + + if factor > 0 and (dimension % factor) == 0 and dimension >= factor**2: + m = factor + n = dimension // factor + if m > n: + n, m = m, n + return m, n + if factor < 0: + factor = dimension + m, n = 1, dimension + length = m + n + while m < n: + new_m = m + 1 + while dimension % new_m != 0: + new_m += 1 + new_n = dimension // new_m + if new_m + new_n > length or new_m > factor: + break + else: + m, n = new_m, new_n + if m > n: + n, m = m, n + return m, n diff --git a/comfy/weight_adapter/loha.py b/comfy/weight_adapter/loha.py index ce79abad5..55c97a3af 100644 --- a/comfy/weight_adapter/loha.py +++ b/comfy/weight_adapter/loha.py @@ -3,7 +3,120 @@ from typing import Optional import torch import comfy.model_management -from .base import WeightAdapterBase, weight_decompose +from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose + + +class HadaWeight(torch.autograd.Function): + @staticmethod + def forward(ctx, w1u, w1d, w2u, w2d, scale=torch.tensor(1)): + ctx.save_for_backward(w1d, w1u, w2d, w2u, scale) + diff_weight = ((w1u @ w1d) * (w2u @ w2d)) * scale + return diff_weight + + @staticmethod + def backward(ctx, grad_out): + (w1d, w1u, w2d, w2u, scale) = ctx.saved_tensors + grad_out = grad_out * scale + temp = grad_out * (w2u @ w2d) + grad_w1u = temp @ w1d.T + grad_w1d = w1u.T @ temp + + temp = grad_out * (w1u @ w1d) + grad_w2u = temp @ w2d.T + grad_w2d = w2u.T @ temp + + del temp + return grad_w1u, grad_w1d, grad_w2u, grad_w2d, None + + +class HadaWeightTucker(torch.autograd.Function): + @staticmethod + def forward(ctx, t1, w1u, w1d, t2, w2u, w2d, scale=torch.tensor(1)): + ctx.save_for_backward(t1, w1d, w1u, t2, w2d, w2u, scale) + + rebuild1 = torch.einsum("i j ..., j r, i p -> p r ...", t1, w1d, w1u) + rebuild2 = torch.einsum("i j ..., j r, i p -> p r ...", t2, w2d, w2u) + + return rebuild1 * rebuild2 * scale + + @staticmethod + def backward(ctx, grad_out): + (t1, w1d, w1u, t2, w2d, w2u, scale) = ctx.saved_tensors + grad_out = grad_out * scale + + temp = torch.einsum("i j ..., j r -> i r ...", t2, w2d) + rebuild = torch.einsum("i j ..., i r -> r j ...", temp, w2u) + + grad_w = rebuild * grad_out + del rebuild + + grad_w1u = torch.einsum("r j ..., i j ... -> r i", temp, grad_w) + grad_temp = torch.einsum("i j ..., i r -> r j ...", grad_w, w1u.T) + del grad_w, temp + + grad_w1d = torch.einsum("i r ..., i j ... -> r j", t1, grad_temp) + grad_t1 = torch.einsum("i j ..., j r -> i r ...", grad_temp, w1d.T) + del grad_temp + + temp = torch.einsum("i j ..., j r -> i r ...", t1, w1d) + rebuild = torch.einsum("i j ..., i r -> r j ...", temp, w1u) + + grad_w = rebuild * grad_out + del rebuild + + grad_w2u = torch.einsum("r j ..., i j ... -> r i", temp, grad_w) + grad_temp = torch.einsum("i j ..., i r -> r j ...", grad_w, w2u.T) + del grad_w, temp + + grad_w2d = torch.einsum("i r ..., i j ... -> r j", t2, grad_temp) + grad_t2 = torch.einsum("i j ..., j r -> i r ...", grad_temp, w2d.T) + del grad_temp + return grad_t1, grad_w1u, grad_w1d, grad_t2, grad_w2u, grad_w2d, None + + +class LohaDiff(WeightAdapterTrainBase): + def __init__(self, weights): + super().__init__() + # Unpack weights tuple from LoHaAdapter + w1a, w1b, alpha, w2a, w2b, t1, t2, _ = weights + + # Create trainable parameters + self.hada_w1_a = torch.nn.Parameter(w1a) + self.hada_w1_b = torch.nn.Parameter(w1b) + self.hada_w2_a = torch.nn.Parameter(w2a) + self.hada_w2_b = torch.nn.Parameter(w2b) + + self.use_tucker = False + if t1 is not None and t2 is not None: + self.use_tucker = True + self.hada_t1 = torch.nn.Parameter(t1) + self.hada_t2 = torch.nn.Parameter(t2) + else: + # Keep the attributes for consistent access + self.hada_t1 = None + self.hada_t2 = None + + # Store rank and non-trainable alpha + self.rank = w1b.shape[0] + self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False) + + def __call__(self, w): + org_dtype = w.dtype + + scale = self.alpha / self.rank + if self.use_tucker: + diff_weight = HadaWeightTucker.apply(self.hada_t1, self.hada_w1_a, self.hada_w1_b, self.hada_t2, self.hada_w2_a, self.hada_w2_b, scale) + else: + diff_weight = HadaWeight.apply(self.hada_w1_a, self.hada_w1_b, self.hada_w2_a, self.hada_w2_b, scale) + + # Add the scaled difference to the original weight + weight = w.to(diff_weight) + diff_weight.reshape(w.shape) + + return weight.to(org_dtype) + + def passive_memory_usage(self): + """Calculates memory usage of the trainable parameters.""" + return sum(param.numel() * param.element_size() for param in self.parameters()) class LoHaAdapter(WeightAdapterBase): @@ -13,6 +126,25 @@ class LoHaAdapter(WeightAdapterBase): self.loaded_keys = loaded_keys self.weights = weights + @classmethod + def create_train(cls, weight, rank=1, alpha=1.0): + out_dim = weight.shape[0] + in_dim = weight.shape[1:].numel() + mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype) + mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype) + torch.nn.init.normal_(mat1, 0.1) + torch.nn.init.constant_(mat2, 0.0) + mat3 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype) + mat4 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype) + torch.nn.init.normal_(mat3, 0.1) + torch.nn.init.normal_(mat4, 0.01) + return LohaDiff( + (mat1, mat2, alpha, mat3, mat4, None, None, None) + ) + + def to_train(self): + return LohaDiff(self.weights) + @classmethod def load( cls, diff --git a/comfy/weight_adapter/lokr.py b/comfy/weight_adapter/lokr.py index 51233db2d..49b0be55f 100644 --- a/comfy/weight_adapter/lokr.py +++ b/comfy/weight_adapter/lokr.py @@ -3,7 +3,77 @@ from typing import Optional import torch import comfy.model_management -from .base import WeightAdapterBase, weight_decompose +from .base import ( + WeightAdapterBase, + WeightAdapterTrainBase, + weight_decompose, + factorization, +) + + +class LokrDiff(WeightAdapterTrainBase): + def __init__(self, weights): + super().__init__() + (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2, dora_scale) = weights + self.use_tucker = False + if lokr_w1_a is not None: + _, rank_a = lokr_w1_a.shape[0], lokr_w1_a.shape[1] + rank_a, _ = lokr_w1_b.shape[0], lokr_w1_b.shape[1] + self.lokr_w1_a = torch.nn.Parameter(lokr_w1_a) + self.lokr_w1_b = torch.nn.Parameter(lokr_w1_b) + self.w1_rebuild = True + self.ranka = rank_a + + if lokr_w2_a is not None: + _, rank_b = lokr_w2_a.shape[0], lokr_w2_a.shape[1] + rank_b, _ = lokr_w2_b.shape[0], lokr_w2_b.shape[1] + self.lokr_w2_a = torch.nn.Parameter(lokr_w2_a) + self.lokr_w2_b = torch.nn.Parameter(lokr_w2_b) + if lokr_t2 is not None: + self.use_tucker = True + self.lokr_t2 = torch.nn.Parameter(lokr_t2) + self.w2_rebuild = True + self.rankb = rank_b + + if lokr_w1 is not None: + self.lokr_w1 = torch.nn.Parameter(lokr_w1) + self.w1_rebuild = False + + if lokr_w2 is not None: + self.lokr_w2 = torch.nn.Parameter(lokr_w2) + self.w2_rebuild = False + + self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False) + + @property + def w1(self): + if self.w1_rebuild: + return (self.lokr_w1_a @ self.lokr_w1_b) * (self.alpha / self.ranka) + else: + return self.lokr_w1 + + @property + def w2(self): + if self.w2_rebuild: + if self.use_tucker: + w2 = torch.einsum( + 'i j k l, j r, i p -> p r k l', + self.lokr_t2, + self.lokr_w2_b, + self.lokr_w2_a + ) + else: + w2 = self.lokr_w2_a @ self.lokr_w2_b + return w2 * (self.alpha / self.rankb) + else: + return self.lokr_w2 + + def __call__(self, w): + diff = torch.kron(self.w1, self.w2) + return w + diff.reshape(w.shape).to(w) + + def passive_memory_usage(self): + return sum(param.numel() * param.element_size() for param in self.parameters()) class LoKrAdapter(WeightAdapterBase): @@ -13,6 +83,20 @@ class LoKrAdapter(WeightAdapterBase): self.loaded_keys = loaded_keys self.weights = weights + @classmethod + def create_train(cls, weight, rank=1, alpha=1.0): + out_dim = weight.shape[0] + in_dim = weight.shape[1:].numel() + out1, out2 = factorization(out_dim, rank) + in1, in2 = factorization(in_dim, rank) + mat1 = torch.empty(out1, in1, device=weight.device, dtype=weight.dtype) + mat2 = torch.empty(out2, in2, device=weight.device, dtype=weight.dtype) + torch.nn.init.kaiming_uniform_(mat2, a=5**0.5) + torch.nn.init.constant_(mat1, 0.0) + return LokrDiff( + (mat1, mat2, alpha, None, None, None, None, None, None) + ) + @classmethod def load( cls, diff --git a/comfy/weight_adapter/oft.py b/comfy/weight_adapter/oft.py index 25009eca3..9d4982083 100644 --- a/comfy/weight_adapter/oft.py +++ b/comfy/weight_adapter/oft.py @@ -3,7 +3,58 @@ from typing import Optional import torch import comfy.model_management -from .base import WeightAdapterBase, weight_decompose +from .base import WeightAdapterBase, WeightAdapterTrainBase, weight_decompose, factorization + + +class OFTDiff(WeightAdapterTrainBase): + def __init__(self, weights): + super().__init__() + # Unpack weights tuple from LoHaAdapter + blocks, rescale, alpha, _ = weights + + # Create trainable parameters + self.oft_blocks = torch.nn.Parameter(blocks) + if rescale is not None: + self.rescale = torch.nn.Parameter(rescale) + self.rescaled = True + else: + self.rescaled = False + self.block_num, self.block_size, _ = blocks.shape + self.constraint = float(alpha) + self.alpha = torch.nn.Parameter(torch.tensor(alpha), requires_grad=False) + + def __call__(self, w): + org_dtype = w.dtype + I = torch.eye(self.block_size, device=self.oft_blocks.device) + + ## generate r + # for Q = -Q^T + q = self.oft_blocks - self.oft_blocks.transpose(1, 2) + normed_q = q + if self.constraint: + q_norm = torch.norm(q) + 1e-8 + if q_norm > self.constraint: + normed_q = q * self.constraint / q_norm + # use float() to prevent unsupported type + r = (I + normed_q) @ (I - normed_q).float().inverse() + + ## Apply chunked matmul on weight + _, *shape = w.shape + org_weight = w.to(dtype=r.dtype) + org_weight = org_weight.unflatten(0, (self.block_num, self.block_size)) + # Init R=0, so add I on it to ensure the output of step0 is original model output + weight = torch.einsum( + "k n m, k n ... -> k m ...", + r, + org_weight, + ).flatten(0, 1) + if self.rescaled: + weight = self.rescale * weight + return weight.to(org_dtype) + + def passive_memory_usage(self): + """Calculates memory usage of the trainable parameters.""" + return sum(param.numel() * param.element_size() for param in self.parameters()) class OFTAdapter(WeightAdapterBase): @@ -13,6 +64,18 @@ class OFTAdapter(WeightAdapterBase): self.loaded_keys = loaded_keys self.weights = weights + @classmethod + def create_train(cls, weight, rank=1, alpha=1.0): + out_dim = weight.shape[0] + block_size, block_num = factorization(out_dim, rank) + block = torch.zeros(block_num, block_size, block_size, device=weight.device, dtype=weight.dtype) + return OFTDiff( + (block, None, alpha, None) + ) + + def to_train(self): + return OFTDiff(self.weights) + @classmethod def load( cls, @@ -60,6 +123,8 @@ class OFTAdapter(WeightAdapterBase): blocks = v[0] rescale = v[1] alpha = v[2] + if alpha is None: + alpha = 0 dora_scale = v[3] blocks = comfy.model_management.cast_to_device(blocks, weight.device, intermediate_dtype) diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index 3d05fdab5..c3aaaee9b 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -20,7 +20,7 @@ import folder_paths import node_helpers from comfy.cli_args import args from comfy.comfy_types.node_typing import IO -from comfy.weight_adapter import adapters +from comfy.weight_adapter import adapters, adapter_maps def make_batch_extra_option_dict(d, indicies, full_size=None): @@ -39,13 +39,13 @@ def make_batch_extra_option_dict(d, indicies, full_size=None): class TrainSampler(comfy.samplers.Sampler): - - def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, total_steps=1, seed=0, training_dtype=torch.bfloat16): + def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, grad_acc=1, total_steps=1, seed=0, training_dtype=torch.bfloat16): self.loss_fn = loss_fn self.optimizer = optimizer self.loss_callback = loss_callback self.batch_size = batch_size self.total_steps = total_steps + self.grad_acc = grad_acc self.seed = seed self.training_dtype = training_dtype @@ -92,8 +92,9 @@ class TrainSampler(comfy.samplers.Sampler): self.loss_callback(loss.item()) pbar.set_postfix({"loss": f"{loss.item():.4f}"}) - self.optimizer.step() - self.optimizer.zero_grad() + if (i+1) % self.grad_acc == 0: + self.optimizer.step() + self.optimizer.zero_grad() torch.cuda.empty_cache() return torch.zeros_like(latent_image) @@ -419,6 +420,16 @@ class TrainLoraNode: "tooltip": "The batch size to use for training.", }, ), + "grad_accumulation_steps": ( + IO.INT, + { + "default": 1, + "min": 1, + "max": 1024, + "step": 1, + "tooltip": "The number of gradient accumulation steps to use for training.", + } + ), "steps": ( IO.INT, { @@ -478,6 +489,17 @@ class TrainLoraNode: ["bf16", "fp32"], {"default": "bf16", "tooltip": "The dtype to use for lora."}, ), + "algorithm": ( + list(adapter_maps.keys()), + {"default": list(adapter_maps.keys())[0], "tooltip": "The algorithm to use for training."}, + ), + "gradient_checkpointing": ( + IO.BOOLEAN, + { + "default": True, + "tooltip": "Use gradient checkpointing for training.", + } + ), "existing_lora": ( folder_paths.get_filename_list("loras") + ["[None]"], { @@ -501,6 +523,7 @@ class TrainLoraNode: positive, batch_size, steps, + grad_accumulation_steps, learning_rate, rank, optimizer, @@ -508,6 +531,8 @@ class TrainLoraNode: seed, training_dtype, lora_dtype, + algorithm, + gradient_checkpointing, existing_lora, ): mp = model.clone() @@ -558,10 +583,8 @@ class TrainLoraNode: if existing_adapter is not None: break else: - # If no existing adapter found, use LoRA - # We will add algo option in the future existing_adapter = None - adapter_cls = adapters[0] + adapter_cls = adapter_maps[algorithm] if existing_adapter is not None: train_adapter = existing_adapter.to_train().to(lora_dtype) @@ -615,8 +638,9 @@ class TrainLoraNode: criterion = torch.nn.SmoothL1Loss() # setup models - for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model): - patch(m) + if gradient_checkpointing: + for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model): + patch(m) mp.model.requires_grad_(False) comfy.model_management.load_models_gpu([mp], memory_required=1e20, force_full_load=True) @@ -629,7 +653,8 @@ class TrainLoraNode: optimizer, loss_callback=loss_callback, batch_size=batch_size, - total_steps=steps, + grad_acc=grad_accumulation_steps, + total_steps=steps*grad_accumulation_steps, seed=seed, training_dtype=dtype ) From 0ccc88b03fbe190135e24ac04612565f8f0756b4 Mon Sep 17 00:00:00 2001 From: honglyua Date: Fri, 25 Jul 2025 01:57:36 +0800 Subject: [PATCH 055/759] Support Iluvatar CoreX (#8585) * Support Iluvatar CoreX Co-authored-by: mingjiang.li --- README.md | 7 +++++++ comfy/model_management.py | 23 ++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d004364ee..a148623cd 100644 --- a/README.md +++ b/README.md @@ -294,6 +294,13 @@ For models compatible with Cambricon Extension for PyTorch (torch_mlu). Here's a 2. Next, install the PyTorch(torch_mlu) following the instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cambricon_pytorch_1.17.0/user_guide_1.9/index.html) 3. Launch ComfyUI by running `python main.py` +#### Iluvatar Corex + +For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step guide tailored to your platform and installation method: + +1. Install the Iluvatar Corex Toolkit by adhering to the platform-specific instructions on the [Installation](https://support.iluvatar.com/#/DocumentCentre?id=1&nameCenter=2&productId=520117912052801536) +2. Launch ComfyUI by running `python main.py` + # Running ```python main.py``` diff --git a/comfy/model_management.py b/comfy/model_management.py index e8b9b5c81..9add54ceb 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -128,6 +128,11 @@ try: except: mlu_available = False +try: + ixuca_available = hasattr(torch, "corex") +except: + ixuca_available = False + if args.cpu: cpu_state = CPUState.CPU @@ -151,6 +156,12 @@ def is_mlu(): return True return False +def is_ixuca(): + global ixuca_available + if ixuca_available: + return True + return False + def get_torch_device(): global directml_enabled global cpu_state @@ -289,7 +300,7 @@ try: if torch_version_numeric[0] >= 2: if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False: ENABLE_PYTORCH_ATTENTION = True - if is_intel_xpu() or is_ascend_npu() or is_mlu(): + if is_intel_xpu() or is_ascend_npu() or is_mlu() or is_ixuca(): if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: ENABLE_PYTORCH_ATTENTION = True except: @@ -1045,6 +1056,8 @@ def xformers_enabled(): return False if is_mlu(): return False + if is_ixuca(): + return False if directml_enabled: return False return XFORMERS_IS_AVAILABLE @@ -1080,6 +1093,8 @@ def pytorch_attention_flash_attention(): return True if is_amd(): return True #if you have pytorch attention enabled on AMD it probably supports at least mem efficient attention + if is_ixuca(): + return True return False def force_upcast_attention_dtype(): @@ -1205,6 +1220,9 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma if is_mlu(): return True + if is_ixuca(): + return True + if torch.version.hip: return True @@ -1268,6 +1286,9 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma if is_ascend_npu(): return True + if is_ixuca(): + return True + if is_amd(): arch = torch.cuda.get_device_properties(device).gcnArchName if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]): # RDNA2 and older don't support bf16 From d03ae077b4330f58e7caba53ff94e7fd58d0dc7d Mon Sep 17 00:00:00 2001 From: SHIVANSH GUPTA <121501003+shivansh-gupta4@users.noreply.github.com> Date: Thu, 24 Jul 2025 23:35:54 +0530 Subject: [PATCH 056/759] Added parameter required_frontend_version in the /system_stats API response (#8875) * Added the parameter required_frontend_version in the /system_stats api response * Update server.py * Created a function get_required_frontend_version and wrote tests for it * Refactored the function to return currently installed frontend pacakage version * Moved required_frontend to a new function and imported that in server.py * Corrected test cases using mocking techniques * Corrected files to comply with ruff formatting --- app/frontend_management.py | 47 +++++++++++++++++--- server.py | 2 + tests-unit/app_test/frontend_manager_test.py | 35 ++++++++++++++- 3 files changed, 77 insertions(+), 7 deletions(-) diff --git a/app/frontend_management.py b/app/frontend_management.py index 001ebbecb..0bee73685 100644 --- a/app/frontend_management.py +++ b/app/frontend_management.py @@ -29,18 +29,48 @@ def frontend_install_warning_message(): This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead. """.strip() +def parse_version(version: str) -> tuple[int, int, int]: + return tuple(map(int, version.split("."))) + +def is_valid_version(version: str) -> bool: + """Validate if a string is a valid semantic version (X.Y.Z format).""" + pattern = r"^(\d+)\.(\d+)\.(\d+)$" + return bool(re.match(pattern, version)) + +def get_installed_frontend_version(): + """Get the currently installed frontend package version.""" + frontend_version_str = version("comfyui-frontend-package") + return frontend_version_str + +def get_required_frontend_version(): + """Get the required frontend version from requirements.txt.""" + try: + with open(requirements_path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line.startswith("comfyui-frontend-package=="): + version_str = line.split("==")[-1] + if not is_valid_version(version_str): + logging.error(f"Invalid version format in requirements.txt: {version_str}") + return None + return version_str + logging.error("comfyui-frontend-package not found in requirements.txt") + return None + except FileNotFoundError: + logging.error("requirements.txt not found. Cannot determine required frontend version.") + return None + except Exception as e: + logging.error(f"Error reading requirements.txt: {e}") + return None def check_frontend_version(): """Check if the frontend version is up to date.""" - def parse_version(version: str) -> tuple[int, int, int]: - return tuple(map(int, version.split("."))) - try: - frontend_version_str = version("comfyui-frontend-package") + frontend_version_str = get_installed_frontend_version() frontend_version = parse_version(frontend_version_str) - with open(requirements_path, "r", encoding="utf-8") as f: - required_frontend = parse_version(f.readline().split("=")[-1]) + required_frontend_str = get_required_frontend_version() + required_frontend = parse_version(required_frontend_str) if frontend_version < required_frontend: app.logger.log_startup_warning( f""" @@ -168,6 +198,11 @@ def download_release_asset_zip(release: Release, destination_path: str) -> None: class FrontendManager: CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions") + @classmethod + def get_required_frontend_version(cls) -> str: + """Get the required frontend package version.""" + return get_required_frontend_version() + @classmethod def default_frontend_path(cls) -> str: try: diff --git a/server.py b/server.py index 71a58f0fa..f4de0079b 100644 --- a/server.py +++ b/server.py @@ -553,6 +553,7 @@ class PromptServer(): ram_free = comfy.model_management.get_free_memory(cpu_device) vram_total, torch_vram_total = comfy.model_management.get_total_memory(device, torch_total_too=True) vram_free, torch_vram_free = comfy.model_management.get_free_memory(device, torch_free_too=True) + required_frontend_version = FrontendManager.get_required_frontend_version() system_stats = { "system": { @@ -560,6 +561,7 @@ class PromptServer(): "ram_total": ram_total, "ram_free": ram_free, "comfyui_version": __version__, + "required_frontend_version": required_frontend_version, "python_version": sys.version, "pytorch_version": comfy.model_management.torch_version, "embedded_python": os.path.split(os.path.split(sys.executable)[0])[1] == "python_embeded", diff --git a/tests-unit/app_test/frontend_manager_test.py b/tests-unit/app_test/frontend_manager_test.py index ce67df6c6..ce43ac564 100644 --- a/tests-unit/app_test/frontend_manager_test.py +++ b/tests-unit/app_test/frontend_manager_test.py @@ -1,7 +1,7 @@ import argparse import pytest from requests.exceptions import HTTPError -from unittest.mock import patch +from unittest.mock import patch, mock_open from app.frontend_management import ( FrontendManager, @@ -172,3 +172,36 @@ def test_init_frontend_fallback_on_error(): # Assert assert frontend_path == "/default/path" mock_check.assert_called_once() + + +def test_get_frontend_version(): + # Arrange + expected_version = "1.25.0" + mock_requirements_content = """torch +torchsde +comfyui-frontend-package==1.25.0 +other-package==1.0.0 +numpy""" + + # Act + with patch("builtins.open", mock_open(read_data=mock_requirements_content)): + version = FrontendManager.get_required_frontend_version() + + # Assert + assert version == expected_version + + +def test_get_frontend_version_invalid_semver(): + # Arrange + mock_requirements_content = """torch +torchsde +comfyui-frontend-package==1.29.3.75 +other-package==1.0.0 +numpy""" + + # Act + with patch("builtins.open", mock_open(read_data=mock_requirements_content)): + version = FrontendManager.get_required_frontend_version() + + # Assert + assert version is None From 69cb57b3426b08a82e7fb713b0b48c23725f3da7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 24 Jul 2025 12:06:25 -0700 Subject: [PATCH 057/759] Print xpu device name. (#9035) --- comfy/model_management.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/model_management.py b/comfy/model_management.py index 9add54ceb..232d363aa 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -392,6 +392,8 @@ def get_torch_device_name(device): except: allocator_backend = "" return "{} {} : {}".format(device, torch.cuda.get_device_name(device), allocator_backend) + elif device.type == "xpu": + return "{} {}".format(device, torch.xpu.get_device_name(device)) else: return "{}".format(device.type) elif is_intel_xpu(): From 4293e4da214f77a3fde97c15f0691307e61bc18d Mon Sep 17 00:00:00 2001 From: Eugene Fairley Date: Thu, 24 Jul 2025 17:59:19 -0700 Subject: [PATCH 058/759] Add WAN ATI support (#8874) * Add WAN ATI support * Fixes * Fix length * Remove extra functions * Fix * Fix * Ruff fix * Remove torch.no_grad * Add batch trajectory logic * Scale inputs before and after motion patch * Batch image/trajectory * Ruff fix * Clean up --- comfy/utils.py | 20 +++ comfy_extras/nodes_wan.py | 305 +++++++++++++++++++++++++++++++++++++- 2 files changed, 324 insertions(+), 1 deletion(-) diff --git a/comfy/utils.py b/comfy/utils.py index 9c076a0e0..fab28cf08 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -698,6 +698,26 @@ def resize_to_batch_size(tensor, batch_size): return output +def resize_list_to_batch_size(l, batch_size): + in_batch_size = len(l) + if in_batch_size == batch_size or in_batch_size == 0: + return l + + if batch_size <= 1: + return l[:batch_size] + + output = [] + if batch_size < in_batch_size: + scale = (in_batch_size - 1) / (batch_size - 1) + for i in range(batch_size): + output.append(l[min(round(i * scale), in_batch_size - 1)]) + else: + scale = in_batch_size / batch_size + for i in range(batch_size): + output.append(l[min(math.floor((i + 0.5) * scale), in_batch_size - 1)]) + + return output + def convert_sd_to(state_dict, dtype): keys = list(state_dict.keys()) for k in keys: diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index d6097a104..d71908f31 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1,3 +1,4 @@ +import math import nodes import node_helpers import torch @@ -5,7 +6,9 @@ import comfy.model_management import comfy.utils import comfy.latent_formats import comfy.clip_vision - +import json +import numpy as np +from typing import Tuple class WanImageToVideo: @classmethod @@ -383,7 +386,307 @@ class WanPhantomSubjectToVideo: out_latent["samples"] = latent return (positive, cond2, negative, out_latent) +def parse_json_tracks(tracks): + """Parse JSON track data into a standardized format""" + tracks_data = [] + try: + # If tracks is a string, try to parse it as JSON + if isinstance(tracks, str): + parsed = json.loads(tracks.replace("'", '"')) + tracks_data.extend(parsed) + else: + # If tracks is a list of strings, parse each one + for track_str in tracks: + parsed = json.loads(track_str.replace("'", '"')) + tracks_data.append(parsed) + + # Check if we have a single track (dict with x,y) or a list of tracks + if tracks_data and isinstance(tracks_data[0], dict) and 'x' in tracks_data[0]: + # Single track detected, wrap it in a list + tracks_data = [tracks_data] + elif tracks_data and isinstance(tracks_data[0], list) and tracks_data[0] and isinstance(tracks_data[0][0], dict) and 'x' in tracks_data[0][0]: + # Already a list of tracks, nothing to do + pass + else: + # Unexpected format + pass + + except json.JSONDecodeError: + tracks_data = [] + return tracks_data + +def process_tracks(tracks_np: np.ndarray, frame_size: Tuple[int, int], num_frames, quant_multi: int = 8, **kwargs): + # tracks: shape [t, h, w, 3] => samples align with 24 fps, model trained with 16 fps. + # frame_size: tuple (W, H) + tracks = torch.from_numpy(tracks_np).float() + + if tracks.shape[1] == 121: + tracks = torch.permute(tracks, (1, 0, 2, 3)) + + tracks, visibles = tracks[..., :2], tracks[..., 2:3] + + short_edge = min(*frame_size) + + frame_center = torch.tensor([*frame_size]).type_as(tracks) / 2 + tracks = tracks - frame_center + + tracks = tracks / short_edge * 2 + + visibles = visibles * 2 - 1 + + trange = torch.linspace(-1, 1, tracks.shape[0]).view(-1, 1, 1, 1).expand(*visibles.shape) + + out_ = torch.cat([trange, tracks, visibles], dim=-1).view(121, -1, 4) + + out_0 = out_[:1] + + out_l = out_[1:] # 121 => 120 | 1 + a = 120 // math.gcd(120, num_frames) + b = num_frames // math.gcd(120, num_frames) + out_l = torch.repeat_interleave(out_l, b, dim=0)[1::a] # 120 => 120 * b => 120 * b / a == F + + final_result = torch.cat([out_0, out_l], dim=0) + + return final_result + +FIXED_LENGTH = 121 +def pad_pts(tr): + """Convert list of {x,y} to (FIXED_LENGTH,1,3) array, padding/truncating.""" + pts = np.array([[p['x'], p['y'], 1] for p in tr], dtype=np.float32) + n = pts.shape[0] + if n < FIXED_LENGTH: + pad = np.zeros((FIXED_LENGTH - n, 3), dtype=np.float32) + pts = np.vstack((pts, pad)) + else: + pts = pts[:FIXED_LENGTH] + return pts.reshape(FIXED_LENGTH, 1, 3) + +def ind_sel(target: torch.Tensor, ind: torch.Tensor, dim: int = 1): + """Index selection utility function""" + assert ( + len(ind.shape) > dim + ), "Index must have the target dim, but get dim: %d, ind shape: %s" % (dim, str(ind.shape)) + + target = target.expand( + *tuple( + [ind.shape[k] if target.shape[k] == 1 else -1 for k in range(dim)] + + [ + -1, + ] + * (len(target.shape) - dim) + ) + ) + + ind_pad = ind + + if len(target.shape) > dim + 1: + for _ in range(len(target.shape) - (dim + 1)): + ind_pad = ind_pad.unsqueeze(-1) + ind_pad = ind_pad.expand(*(-1,) * (dim + 1), *target.shape[(dim + 1) : :]) + + return torch.gather(target, dim=dim, index=ind_pad) + +def merge_final(vert_attr: torch.Tensor, weight: torch.Tensor, vert_assign: torch.Tensor): + """Merge vertex attributes with weights""" + target_dim = len(vert_assign.shape) - 1 + if len(vert_attr.shape) == 2: + assert vert_attr.shape[0] > vert_assign.max() + new_shape = [1] * target_dim + list(vert_attr.shape) + tensor = vert_attr.reshape(new_shape) + sel_attr = ind_sel(tensor, vert_assign.type(torch.long), dim=target_dim) + else: + assert vert_attr.shape[1] > vert_assign.max() + new_shape = [vert_attr.shape[0]] + [1] * (target_dim - 1) + list(vert_attr.shape[1:]) + tensor = vert_attr.reshape(new_shape) + sel_attr = ind_sel(tensor, vert_assign.type(torch.long), dim=target_dim) + + final_attr = torch.sum(sel_attr * weight.unsqueeze(-1), dim=-2) + return final_attr + + +def _patch_motion_single( + tracks: torch.FloatTensor, # (B, T, N, 4) + vid: torch.FloatTensor, # (C, T, H, W) + temperature: float, + vae_divide: tuple, + topk: int, +): + """Apply motion patching based on tracks""" + _, T, H, W = vid.shape + N = tracks.shape[2] + _, tracks_xy, visible = torch.split( + tracks, [1, 2, 1], dim=-1 + ) # (B, T, N, 2) | (B, T, N, 1) + tracks_n = tracks_xy / torch.tensor([W / min(H, W), H / min(H, W)], device=tracks_xy.device) + tracks_n = tracks_n.clamp(-1, 1) + visible = visible.clamp(0, 1) + + xx = torch.linspace(-W / min(H, W), W / min(H, W), W) + yy = torch.linspace(-H / min(H, W), H / min(H, W), H) + + grid = torch.stack(torch.meshgrid(yy, xx, indexing="ij")[::-1], dim=-1).to( + tracks_xy.device + ) + + tracks_pad = tracks_xy[:, 1:] + visible_pad = visible[:, 1:] + + visible_align = visible_pad.view(T - 1, 4, *visible_pad.shape[2:]).sum(1) + tracks_align = (tracks_pad * visible_pad).view(T - 1, 4, *tracks_pad.shape[2:]).sum( + 1 + ) / (visible_align + 1e-5) + dist_ = ( + (tracks_align[:, None, None] - grid[None, :, :, None]).pow(2).sum(-1) + ) # T, H, W, N + weight = torch.exp(-dist_ * temperature) * visible_align.clamp(0, 1).view( + T - 1, 1, 1, N + ) + vert_weight, vert_index = torch.topk( + weight, k=min(topk, weight.shape[-1]), dim=-1 + ) + + grid_mode = "bilinear" + point_feature = torch.nn.functional.grid_sample( + vid.permute(1, 0, 2, 3)[:1], + tracks_n[:, :1].type(vid.dtype), + mode=grid_mode, + padding_mode="zeros", + align_corners=False, + ) + point_feature = point_feature.squeeze(0).squeeze(1).permute(1, 0) # N, C=16 + + out_feature = merge_final(point_feature, vert_weight, vert_index).permute(3, 0, 1, 2) # T - 1, H, W, C => C, T - 1, H, W + out_weight = vert_weight.sum(-1) # T - 1, H, W + + # out feature -> already soft weighted + mix_feature = out_feature + vid[:, 1:] * (1 - out_weight.clamp(0, 1)) + + out_feature_full = torch.cat([vid[:, :1], mix_feature], dim=1) # C, T, H, W + out_mask_full = torch.cat([torch.ones_like(out_weight[:1]), out_weight], dim=0) # T, H, W + + return out_mask_full[None].expand(vae_divide[0], -1, -1, -1), out_feature_full + + +def patch_motion( + tracks: torch.FloatTensor, # (B, TB, T, N, 4) + vid: torch.FloatTensor, # (C, T, H, W) + temperature: float = 220.0, + vae_divide: tuple = (4, 16), + topk: int = 2, +): + B = len(tracks) + + # Process each batch separately + out_masks = [] + out_features = [] + + for b in range(B): + mask, feature = _patch_motion_single( + tracks[b], # (T, N, 4) + vid[b], # (C, T, H, W) + temperature, + vae_divide, + topk + ) + out_masks.append(mask) + out_features.append(feature) + + # Stack results: (B, C, T, H, W) + out_mask_full = torch.stack(out_masks, dim=0) + out_feature_full = torch.stack(out_features, dim=0) + + return out_mask_full, out_feature_full + +class WanTrackToVideo: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "positive": ("CONDITIONING", ), + "negative": ("CONDITIONING", ), + "vae": ("VAE", ), + "tracks": ("STRING", {"multiline": True, "default": "[]"}), + "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), + "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), + "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), + "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), + "temperature": ("FLOAT", {"default": 220.0, "min": 1.0, "max": 1000.0, "step": 0.1}), + "topk": ("INT", {"default": 2, "min": 1, "max": 10}), + "start_image": ("IMAGE", ), + }, + "optional": { + "clip_vision_output": ("CLIP_VISION_OUTPUT", ), + }} + + RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") + RETURN_NAMES = ("positive", "negative", "latent") + FUNCTION = "encode" + + CATEGORY = "conditioning/video_models" + + def encode(self, positive, negative, vae, tracks, width, height, length, batch_size, + temperature, topk, start_image=None, clip_vision_output=None): + + tracks_data = parse_json_tracks(tracks) + + if not tracks_data: + return WanImageToVideo().encode(positive, negative, vae, width, height, length, batch_size, start_image=start_image, clip_vision_output=clip_vision_output) + + latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], + device=comfy.model_management.intermediate_device()) + + if isinstance(tracks_data[0][0], dict): + tracks_data = [tracks_data] + + processed_tracks = [] + for batch in tracks_data: + arrs = [] + for track in batch: + pts = pad_pts(track) + arrs.append(pts) + + tracks_np = np.stack(arrs, axis=0) + processed_tracks.append(process_tracks(tracks_np, (width, height), length - 1).unsqueeze(0)) + + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image[:batch_size].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + videos = torch.ones((start_image.shape[0], length, height, width, start_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) * 0.5 + for i in range(start_image.shape[0]): + videos[i, 0] = start_image[i] + + latent_videos = [] + videos = comfy.utils.resize_to_batch_size(videos, batch_size) + for i in range(batch_size): + latent_videos += [vae.encode(videos[i, :, :, :, :3])] + y = torch.cat(latent_videos, dim=0) + + # Scale latent since patch_motion is non-linear + y = comfy.latent_formats.Wan21().process_in(y) + + processed_tracks = comfy.utils.resize_list_to_batch_size(processed_tracks, batch_size) + res = patch_motion( + processed_tracks, y, temperature=temperature, topk=topk, vae_divide=(4, 16) + ) + + mask, concat_latent_image = res + concat_latent_image = comfy.latent_formats.Wan21().process_out(concat_latent_image) + mask = -mask + 1.0 # Invert mask to match expected format + positive = node_helpers.conditioning_set_values(positive, + {"concat_mask": mask, + "concat_latent_image": concat_latent_image}) + negative = node_helpers.conditioning_set_values(negative, + {"concat_mask": mask, + "concat_latent_image": concat_latent_image}) + + if clip_vision_output is not None: + positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output}) + negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output}) + + out_latent = {} + out_latent["samples"] = latent + return (positive, negative, out_latent) + NODE_CLASS_MAPPINGS = { + "WanTrackToVideo": WanTrackToVideo, "WanImageToVideo": WanImageToVideo, "WanFunControlToVideo": WanFunControlToVideo, "WanFunInpaintToVideo": WanFunInpaintToVideo, From e6e5d33b351fc5ed8334d74dac77b283ecea8708 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:58:28 -0700 Subject: [PATCH 059/759] Remove useless code. (#9041) This is only needed on old pytorch 2.0 and older. --- comfy/ldm/wan/vae.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/comfy/ldm/wan/vae.py b/comfy/ldm/wan/vae.py index a8ebc5ec6..a83c6edfd 100644 --- a/comfy/ldm/wan/vae.py +++ b/comfy/ldm/wan/vae.py @@ -52,15 +52,6 @@ class RMS_norm(nn.Module): x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma.to(x) + (self.bias.to(x) if self.bias is not None else 0) -class Upsample(nn.Upsample): - - def forward(self, x): - """ - Fix bfloat16 support for nearest neighbor interpolation. - """ - return super().forward(x.float()).type_as(x) - - class Resample(nn.Module): def __init__(self, dim, mode): @@ -73,11 +64,11 @@ class Resample(nn.Module): # layers if mode == 'upsample2d': self.resample = nn.Sequential( - Upsample(scale_factor=(2., 2.), mode='nearest-exact'), + nn.Upsample(scale_factor=(2., 2.), mode='nearest-exact'), ops.Conv2d(dim, dim // 2, 3, padding=1)) elif mode == 'upsample3d': self.resample = nn.Sequential( - Upsample(scale_factor=(2., 2.), mode='nearest-exact'), + nn.Upsample(scale_factor=(2., 2.), mode='nearest-exact'), ops.Conv2d(dim, dim // 2, 3, padding=1)) self.time_conv = CausalConv3d( dim, dim * 2, (3, 1, 1), padding=(1, 0, 0)) From 93bc2f8e4d5dace2328b861579df24f91684e27e Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sat, 26 Jul 2025 01:24:23 +0800 Subject: [PATCH 060/759] Update template to 0.1.40 (#9048) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8f6a6d112..33a59b4be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.39 +comfyui-workflow-templates==0.1.40 comfyui-embedded-docs==0.2.4 torch torchsde From c0207b473fa9ad413fad6d5658449356e39758cc Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 25 Jul 2025 14:25:08 -0700 Subject: [PATCH 061/759] Fix issue with line endings github workflow. (#9053) --- .github/workflows/check-line-endings.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/check-line-endings.yml b/.github/workflows/check-line-endings.yml index 03b3e3ced..eeb594d6c 100644 --- a/.github/workflows/check-line-endings.yml +++ b/.github/workflows/check-line-endings.yml @@ -17,8 +17,7 @@ jobs: - name: Check for Windows line endings (CRLF) run: | # Get the list of changed files in the PR - git merge origin/${{ github.base_ref }} --no-edit - CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }}..HEAD) + CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }}) # Flag to track if CRLF is found CRLF_FOUND=false From c60467a148c2b6f7b9fe47725362361f2de9ae50 Mon Sep 17 00:00:00 2001 From: Thor-ATX Date: Sat, 26 Jul 2025 09:27:03 +1200 Subject: [PATCH 062/759] Update negative prompt for Moonvalley nodes (#9038) Co-authored-by: thorsten --- comfy_api_nodes/nodes_moonvalley.py | 359 +++++++++++++++++----------- 1 file changed, 224 insertions(+), 135 deletions(-) diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 6e937411c..057021efa 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -2,7 +2,11 @@ import logging from typing import Any, Callable, Optional, TypeVar import random import torch -from comfy_api_nodes.util.validation_utils import get_image_dimensions, validate_image_dimensions, validate_video_dimensions +from comfy_api_nodes.util.validation_utils import ( + get_image_dimensions, + validate_image_dimensions, + validate_video_dimensions, +) from comfy_api_nodes.apis import ( @@ -10,7 +14,7 @@ from comfy_api_nodes.apis import ( MoonvalleyTextToVideoInferenceParams, MoonvalleyVideoToVideoInferenceParams, MoonvalleyVideoToVideoRequest, - MoonvalleyPromptResponse + MoonvalleyPromptResponse, ) from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -54,20 +58,26 @@ MAX_VIDEO_SIZE = 1024 * 1024 * 1024 # 1 GB max for in-memory video processing MOONVALLEY_MAREY_MAX_PROMPT_LENGTH = 5000 R = TypeVar("R") + + class MoonvalleyApiError(Exception): """Base exception for Moonvalley API errors.""" + pass + def is_valid_task_creation_response(response: MoonvalleyPromptResponse) -> bool: """Verifies that the initial response contains a task ID.""" return bool(response.id) + def validate_task_creation_response(response) -> None: if not is_valid_task_creation_response(response): error_msg = f"Moonvalley Marey API: Initial request failed. Code: {response.code}, Message: {response.message}, Data: {response}" logging.error(error_msg) raise MoonvalleyApiError(error_msg) + def get_video_from_response(response): video = response.output_url logging.info( @@ -102,16 +112,17 @@ def poll_until_finished( poll_interval=16.0, failed_statuses=["error"], status_extractor=lambda response: ( - response.status - if response and response.status - else None + response.status if response and response.status else None ), auth_kwargs=auth_kwargs, result_url_extractor=result_url_extractor, node_id=node_id, ).execute() -def validate_prompts(prompt:str, negative_prompt: str, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH): + +def validate_prompts( + prompt: str, negative_prompt: str, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH +): """Verifies that the prompt isn't empty and that neither prompt is too long.""" if not prompt: raise ValueError("Positive prompt is empty") @@ -123,16 +134,15 @@ def validate_prompts(prompt:str, negative_prompt: str, max_length=MOONVALLEY_MAR ) return True + def validate_input_media(width, height, with_frame_conditioning, num_frames_in=None): - # inference validation - # T = num_frames - # in all cases, the following must be true: T divisible by 16 and H,W by 8. in addition... - # with image conditioning: H*W must be divisible by 8192 - # without image conditioning: T divisible by 32 - if num_frames_in and not num_frames_in % 16 == 0 : - return False, ( - "The input video total frame count must be divisible by 16!" - ) + # inference validation + # T = num_frames + # in all cases, the following must be true: T divisible by 16 and H,W by 8. in addition... + # with image conditioning: H*W must be divisible by 8192 + # without image conditioning: T divisible by 32 + if num_frames_in and not num_frames_in % 16 == 0: + return False, ("The input video total frame count must be divisible by 16!") if height % 8 != 0 or width % 8 != 0: return False, ( @@ -146,13 +156,13 @@ def validate_input_media(width, height, with_frame_conditioning, num_frames_in=N "divisible by 8192 for frame conditioning" ) else: - if num_frames_in and not num_frames_in % 32 == 0 : - return False, ( - "The input video total frame count must be divisible by 32!" - ) + if num_frames_in and not num_frames_in % 32 == 0: + return False, ("The input video total frame count must be divisible by 32!") -def validate_input_image(image: torch.Tensor, with_frame_conditioning: bool=False) -> None: +def validate_input_image( + image: torch.Tensor, with_frame_conditioning: bool = False +) -> None: """ Validates the input image adheres to the expectations of the API: - The image resolution should not be less than 300*300px @@ -160,10 +170,15 @@ def validate_input_image(image: torch.Tensor, with_frame_conditioning: bool=Fals """ height, width = get_image_dimensions(image) - validate_input_media(width, height, with_frame_conditioning ) - validate_image_dimensions(image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH) + validate_input_media(width, height, with_frame_conditioning) + validate_image_dimensions( + image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH + ) -def validate_input_video(video: VideoInput, num_frames_out: int, with_frame_conditioning: bool=False): + +def validate_input_video( + video: VideoInput, num_frames_out: int, with_frame_conditioning: bool = False +): try: width, height = video.get_dimensions() except Exception as e: @@ -171,7 +186,13 @@ def validate_input_video(video: VideoInput, num_frames_out: int, with_frame_cond raise ValueError(f"Cannot get video dimensions: {e}") from e validate_input_media(width, height, with_frame_conditioning) - validate_video_dimensions(video, min_width=MIN_VID_WIDTH, min_height=MIN_VID_HEIGHT, max_width=MAX_VID_WIDTH, max_height=MAX_VID_HEIGHT) + validate_video_dimensions( + video, + min_width=MIN_VID_WIDTH, + min_height=MIN_VID_HEIGHT, + max_width=MAX_VID_WIDTH, + max_height=MAX_VID_HEIGHT, + ) trimmed_video = validate_input_video_length(video, num_frames_out) return trimmed_video @@ -180,22 +201,29 @@ def validate_input_video(video: VideoInput, num_frames_out: int, with_frame_cond def validate_input_video_length(video: VideoInput, num_frames: int): if video.get_duration() > 60: - raise MoonvalleyApiError("Input Video lenth should be less than 1min. Please trim.") + raise MoonvalleyApiError( + "Input Video lenth should be less than 1min. Please trim." + ) if num_frames == 128: - if video.get_duration() < 5: - raise MoonvalleyApiError("Input Video length is less than 5s. Please use a video longer than or equal to 5s.") - if video.get_duration() > 5: - # trim video to 5s - video = trim_video(video, 5) + if video.get_duration() < 5: + raise MoonvalleyApiError( + "Input Video length is less than 5s. Please use a video longer than or equal to 5s." + ) + if video.get_duration() > 5: + # trim video to 5s + video = trim_video(video, 5) if num_frames == 256: if video.get_duration() < 10: - raise MoonvalleyApiError("Input Video length is less than 10s. Please use a video longer than or equal to 10s.") + raise MoonvalleyApiError( + "Input Video length is less than 10s. Please use a video longer than or equal to 10s." + ) if video.get_duration() > 10: # trim video to 10s video = trim_video(video, 10) return video + def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: """ Returns a new VideoInput object trimmed from the beginning to the specified duration, @@ -219,8 +247,8 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: input_source = video.get_stream_source() # Open containers - input_container = av.open(input_source, mode='r') - output_container = av.open(output_buffer, mode='w', format='mp4') + input_container = av.open(input_source, mode="r") + output_container = av.open(output_buffer, mode="w", format="mp4") # Set up output streams for re-encoding video_stream = None @@ -230,22 +258,32 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: logging.info(f"Found stream: type={stream.type}, class={type(stream)}") if isinstance(stream, av.VideoStream): # Create output video stream with same parameters - video_stream = output_container.add_stream('h264', rate=stream.average_rate) + video_stream = output_container.add_stream( + "h264", rate=stream.average_rate + ) video_stream.width = stream.width video_stream.height = stream.height - video_stream.pix_fmt = 'yuv420p' - logging.info(f"Added video stream: {stream.width}x{stream.height} @ {stream.average_rate}fps") + video_stream.pix_fmt = "yuv420p" + logging.info( + f"Added video stream: {stream.width}x{stream.height} @ {stream.average_rate}fps" + ) elif isinstance(stream, av.AudioStream): # Create output audio stream with same parameters - audio_stream = output_container.add_stream('aac', rate=stream.sample_rate) + audio_stream = output_container.add_stream( + "aac", rate=stream.sample_rate + ) audio_stream.sample_rate = stream.sample_rate audio_stream.layout = stream.layout - logging.info(f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels") + logging.info( + f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels" + ) # Calculate target frame count that's divisible by 32 fps = input_container.streams.video[0].average_rate estimated_frames = int(duration_sec * fps) - target_frames = (estimated_frames // 32) * 32 # Round down to nearest multiple of 32 + target_frames = ( + estimated_frames // 32 + ) * 32 # Round down to nearest multiple of 32 if target_frames == 0: raise ValueError("Video too short: need at least 32 frames for Moonvalley") @@ -268,7 +306,9 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: for packet in video_stream.encode(): output_container.mux(packet) - logging.info(f"Encoded {frame_count} video frames (target: {target_frames})") + logging.info( + f"Encoded {frame_count} video frames (target: {target_frames})" + ) # Decode and re-encode audio frames if audio_stream: @@ -292,7 +332,6 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: output_container.close() input_container.close() - # Return as VideoFromFile using the buffer output_buffer.seek(0) return VideoFromFile(output_buffer) @@ -305,6 +344,7 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: output_container.close() raise RuntimeError(f"Failed to trim video: {str(e)}") from e + # --- BaseMoonvalleyVideoNode --- class BaseMoonvalleyVideoNode: def parseWidthHeightFromRes(self, resolution: str): @@ -328,7 +368,7 @@ class BaseMoonvalleyVideoNode: "Motion Transfer": "motion_control", "Canny": "canny_control", "Pose Transfer": "pose_control", - "Depth": "depth_control" + "Depth": "depth_control", } if value in control_map: return control_map[value] @@ -355,31 +395,63 @@ class BaseMoonvalleyVideoNode: return { "required": { "prompt": model_field_to_node_input( - IO.STRING, MoonvalleyTextToVideoRequest, "prompt_text", - multiline=True + IO.STRING, + MoonvalleyTextToVideoRequest, + "prompt_text", + multiline=True, ), "negative_prompt": model_field_to_node_input( IO.STRING, MoonvalleyTextToVideoInferenceParams, "negative_prompt", multiline=True, - default="gopro, bright, contrast, static, overexposed, bright, vignette, artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, flare, saturation, distorted, warped, wide angle, contrast, saturated, vibrant, glowing, cross dissolve, texture, videogame, saturation, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, blown out, horrible, blurry, worst quality, bad, transition, dissolve, cross-dissolve, melt, fade in, fade out, wobbly, weird, low quality, plastic, stock footage, video camera, boring, static", + default="low-poly, flat shader, bad rigging, stiff animation, uncanny eyes, low-quality textures, looping glitch, cheap effect, overbloom, bloom spam, default lighting, game asset, stiff face, ugly specular, AI artifacts", ), - - "resolution": (IO.COMBO, { - "options": ["16:9 (1920 x 1080)", - "9:16 (1080 x 1920)", - "1:1 (1152 x 1152)", - "4:3 (1440 x 1080)", - "3:4 (1080 x 1440)", - "21:9 (2560 x 1080)"], + "resolution": ( + IO.COMBO, + { + "options": [ + "16:9 (1920 x 1080)", + "9:16 (1080 x 1920)", + "1:1 (1152 x 1152)", + "4:3 (1440 x 1080)", + "3:4 (1080 x 1440)", + "21:9 (2560 x 1080)", + ], "default": "16:9 (1920 x 1080)", "tooltip": "Resolution of the output video", - }), + }, + ), # "length": (IO.COMBO,{"options":['5s','10s'], "default": '5s'}), - "prompt_adherence": model_field_to_node_input(IO.FLOAT,MoonvalleyTextToVideoInferenceParams,"guidance_scale",default=7.0, step=1, min=1, max=20), - "seed": model_field_to_node_input(IO.INT,MoonvalleyTextToVideoInferenceParams, "seed", default=random.randint(0, 2**32 - 1), min=0, max=4294967295, step=1, display="number", tooltip="Random seed value", control_after_generate=True), - "steps": model_field_to_node_input(IO.INT, MoonvalleyTextToVideoInferenceParams, "steps", default=100, min=1, max=100), + "prompt_adherence": model_field_to_node_input( + IO.FLOAT, + MoonvalleyTextToVideoInferenceParams, + "guidance_scale", + default=7.0, + step=1, + min=1, + max=20, + ), + "seed": model_field_to_node_input( + IO.INT, + MoonvalleyTextToVideoInferenceParams, + "seed", + default=random.randint(0, 2**32 - 1), + min=0, + max=4294967295, + step=1, + display="number", + tooltip="Random seed value", + control_after_generate=True, + ), + "steps": model_field_to_node_input( + IO.INT, + MoonvalleyTextToVideoInferenceParams, + "steps", + default=100, + min=1, + max=100, + ), }, "hidden": { "auth_token": "AUTH_TOKEN_COMFY_ORG", @@ -393,7 +465,7 @@ class BaseMoonvalleyVideoNode: "image_url", tooltip="The reference image used to generate the video", ), - } + }, } RETURN_TYPES = ("STRING",) @@ -404,6 +476,7 @@ class BaseMoonvalleyVideoNode: def generate(self, **kwargs): return None + # --- MoonvalleyImg2VideoNode --- class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): @@ -415,43 +488,46 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): RETURN_NAMES = ("video",) DESCRIPTION = "Moonvalley Marey Image to Video Node" - def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): + def generate( + self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs + ): image = kwargs.get("image", None) - if (image is None): + if image is None: raise MoonvalleyApiError("image is required") total_frames = get_total_frames_from_length() - validate_input_image(image,True) + validate_input_image(image, True) validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) - inference_params=MoonvalleyTextToVideoInferenceParams( - negative_prompt=negative_prompt, - steps=kwargs.get("steps"), - seed=kwargs.get("seed"), - guidance_scale=kwargs.get("prompt_adherence"), - num_frames=total_frames, - width=width_height.get("width"), - height=width_height.get("height"), - use_negative_prompts=True - ) + inference_params = MoonvalleyTextToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + num_frames=total_frames, + width=width_height.get("width"), + height=width_height.get("height"), + use_negative_prompts=True, + ) """Upload image to comfy backend to have a URL available for further processing""" # Get MIME type from tensor - assuming PNG format for image tensors mime_type = "image/png" - image_url = upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type)[0] + image_url = upload_images_to_comfyapi( + image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type + )[0] request = MoonvalleyTextToVideoRequest( - image_url=image_url, - prompt_text=prompt, - inference_params=inference_params - ) + image_url=image_url, prompt_text=prompt, inference_params=inference_params + ) initial_operation = SynchronousOperation( - endpoint=ApiEndpoint(path=API_IMG2VIDEO_ENDPOINT, - method=HttpMethod.POST, - request_model=MoonvalleyTextToVideoRequest, - response_model=MoonvalleyPromptResponse - ), + endpoint=ApiEndpoint( + path=API_IMG2VIDEO_ENDPOINT, + method=HttpMethod.POST, + request_model=MoonvalleyTextToVideoRequest, + response_model=MoonvalleyPromptResponse, + ), request=request, auth_kwargs=kwargs, ) @@ -463,7 +539,8 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): task_id, auth_kwargs=kwargs, node_id=unique_id ) video = download_url_to_video_output(final_response.output_url) - return (video, ) + return (video,) + # --- MoonvalleyVid2VidNode --- class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): @@ -479,38 +556,46 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): if param in input_types["optional"]: del input_types["optional"][param] input_types["optional"] = { - "video": (IO.VIDEO, {"default": "", "multiline": False, "tooltip": "The reference video used to generate the output video. Input a 5s video for 128 frames and a 10s video for 256 frames. Longer videos will be trimmed automatically."}), - "control_type": ( - ["Motion Transfer", "Pose Transfer"], - {"default": "Motion Transfer"}, - ), - "motion_intensity": ( - "INT", - { - "default": 100, - "step": 1, - "min": 0, - "max": 100, - "tooltip": "Only used if control_type is 'Motion Transfer'", - }, - ) - } + "video": ( + IO.VIDEO, + { + "default": "", + "multiline": False, + "tooltip": "The reference video used to generate the output video. Input a 5s video for 128 frames and a 10s video for 256 frames. Longer videos will be trimmed automatically.", + }, + ), + "control_type": ( + ["Motion Transfer", "Pose Transfer"], + {"default": "Motion Transfer"}, + ), + "motion_intensity": ( + "INT", + { + "default": 100, + "step": 1, + "min": 0, + "max": 100, + "tooltip": "Only used if control_type is 'Motion Transfer'", + }, + ), + } return input_types RETURN_TYPES = ("VIDEO",) RETURN_NAMES = ("video",) - def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): + def generate( + self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs + ): video = kwargs.get("video") num_frames = get_total_frames_from_length() - if not video : + if not video: raise MoonvalleyApiError("video is required") - """Validate video input""" - video_url="" + video_url = "" if video: validated_video = validate_input_video(video, num_frames, False) video_url = upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) @@ -520,29 +605,30 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): """Validate prompts and inference input""" validate_prompts(prompt, negative_prompt) - inference_params=MoonvalleyVideoToVideoInferenceParams( + inference_params = MoonvalleyVideoToVideoInferenceParams( negative_prompt=negative_prompt, steps=kwargs.get("steps"), seed=kwargs.get("seed"), guidance_scale=kwargs.get("prompt_adherence"), - control_params={'motion_intensity': motion_intensity} + control_params={"motion_intensity": motion_intensity}, ) control = self.parseControlParameter(control_type) request = MoonvalleyVideoToVideoRequest( - control_type=control, - video_url=video_url, - prompt_text=prompt, - inference_params=inference_params - ) + control_type=control, + video_url=video_url, + prompt_text=prompt, + inference_params=inference_params, + ) initial_operation = SynchronousOperation( - endpoint=ApiEndpoint(path=API_VIDEO2VIDEO_ENDPOINT, - method=HttpMethod.POST, - request_model=MoonvalleyVideoToVideoRequest, - response_model=MoonvalleyPromptResponse - ), + endpoint=ApiEndpoint( + path=API_VIDEO2VIDEO_ENDPOINT, + method=HttpMethod.POST, + request_model=MoonvalleyVideoToVideoRequest, + response_model=MoonvalleyPromptResponse, + ), request=request, auth_kwargs=kwargs, ) @@ -556,7 +642,8 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): video = download_url_to_video_output(final_response.output_url) - return (video, ) + return (video,) + # --- MoonvalleyTxt2VideoNode --- class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): @@ -575,31 +662,33 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): del input_types["optional"][param] return input_types - def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs): + def generate( + self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs + ): validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) num_frames = get_total_frames_from_length() - inference_params=MoonvalleyTextToVideoInferenceParams( - negative_prompt=negative_prompt, - steps=kwargs.get("steps"), - seed=kwargs.get("seed"), - guidance_scale=kwargs.get("prompt_adherence"), - num_frames=num_frames, - width=width_height.get("width"), - height=width_height.get("height"), - ) + inference_params = MoonvalleyTextToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + num_frames=num_frames, + width=width_height.get("width"), + height=width_height.get("height"), + ) request = MoonvalleyTextToVideoRequest( - prompt_text=prompt, - inference_params=inference_params - ) + prompt_text=prompt, inference_params=inference_params + ) initial_operation = SynchronousOperation( - endpoint=ApiEndpoint(path=API_TXT2VIDEO_ENDPOINT, - method=HttpMethod.POST, - request_model=MoonvalleyTextToVideoRequest, - response_model=MoonvalleyPromptResponse - ), + endpoint=ApiEndpoint( + path=API_TXT2VIDEO_ENDPOINT, + method=HttpMethod.POST, + request_model=MoonvalleyTextToVideoRequest, + response_model=MoonvalleyPromptResponse, + ), request=request, auth_kwargs=kwargs, ) @@ -612,8 +701,7 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): ) video = download_url_to_video_output(final_response.output_url) - return (video, ) - + return (video,) NODE_CLASS_MAPPINGS = { @@ -629,6 +717,7 @@ NODE_DISPLAY_NAME_MAPPINGS = { # "MoonvalleyVideo2VideoNode": "Moonvalley Marey Video to Video", } + def get_total_frames_from_length(length="5s"): # if length == '5s': # return 128 From b850d9a8bb2c99989fe79d1fded26ab5c103c7b2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 25 Jul 2025 18:25:45 -0700 Subject: [PATCH 063/759] Add map_function to get_history. (#9056) --- execution.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/execution.py b/execution.py index c3a62f1cb..8a9663a7d 100644 --- a/execution.py +++ b/execution.py @@ -1097,7 +1097,7 @@ class PromptQueue: return True return False - def get_history(self, prompt_id=None, max_items=None, offset=-1): + def get_history(self, prompt_id=None, max_items=None, offset=-1, map_function=None): with self.mutex: if prompt_id is None: out = {} @@ -1106,13 +1106,21 @@ class PromptQueue: offset = len(self.history) - max_items for k in self.history: if i >= offset: - out[k] = self.history[k] + p = self.history[k] + if map_function is not None: + p = map_function(p) + out[k] = p if max_items is not None and len(out) >= max_items: break i += 1 return out elif prompt_id in self.history: - return {prompt_id: copy.deepcopy(self.history[prompt_id])} + p = self.history[prompt_id] + if map_function is None: + p = copy.deepcopy(p) + else: + p = map_function(p) + return {prompt_id: p} else: return {} From 0621d73a9c56fdc9e79aad87ed260135639bca50 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 26 Jul 2025 01:44:19 -0700 Subject: [PATCH 064/759] Remove useless code. (#9059) --- comfy/ldm/wan/vae.py | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/comfy/ldm/wan/vae.py b/comfy/ldm/wan/vae.py index a83c6edfd..6b07840fc 100644 --- a/comfy/ldm/wan/vae.py +++ b/comfy/ldm/wan/vae.py @@ -148,29 +148,6 @@ class Resample(nn.Module): feat_idx[0] += 1 return x - def init_weight(self, conv): - conv_weight = conv.weight - nn.init.zeros_(conv_weight) - c1, c2, t, h, w = conv_weight.size() - one_matrix = torch.eye(c1, c2) - init_matrix = one_matrix - nn.init.zeros_(conv_weight) - #conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5 - conv_weight.data[:, :, 1, 0, 0] = init_matrix #* 0.5 - conv.weight.data.copy_(conv_weight) - nn.init.zeros_(conv.bias.data) - - def init_weight2(self, conv): - conv_weight = conv.weight.data - nn.init.zeros_(conv_weight) - c1, c2, t, h, w = conv_weight.size() - init_matrix = torch.eye(c1 // 2, c2) - #init_matrix = repeat(init_matrix, 'o ... -> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2) - conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix - conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix - conv.weight.data.copy_(conv_weight) - nn.init.zeros_(conv.bias.data) - class ResidualBlock(nn.Module): @@ -485,12 +462,6 @@ class WanVAE(nn.Module): self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks, attn_scales, self.temperal_upsample, dropout) - def forward(self, x): - mu, log_var = self.encode(x) - z = self.reparameterize(mu, log_var) - x_recon = self.decode(z) - return x_recon, mu, log_var - def encode(self, x): self.clear_cache() ## cache @@ -536,18 +507,6 @@ class WanVAE(nn.Module): self.clear_cache() return out - def reparameterize(self, mu, log_var): - std = torch.exp(0.5 * log_var) - eps = torch.randn_like(std) - return eps * std + mu - - def sample(self, imgs, deterministic=False): - mu, log_var = self.encode(imgs) - if deterministic: - return mu - std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0)) - return mu + std * torch.randn_like(std) - def clear_cache(self): self._conv_num = count_conv3d(self.decoder) self._conv_idx = [0] From 1ef70fcde4b84e8cd743c4f1fd9cdce24bbadbad Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sun, 27 Jul 2025 05:25:33 +0800 Subject: [PATCH 065/759] Fix the broken link (#9060) --- comfy_api_nodes/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api_nodes/README.md b/comfy_api_nodes/README.md index 64a389cc1..f56d6c860 100644 --- a/comfy_api_nodes/README.md +++ b/comfy_api_nodes/README.md @@ -2,7 +2,7 @@ ## Introduction -Below are a collection of nodes that work by calling external APIs. More information available in our [docs](https://docs.comfy.org/tutorials/api-nodes/overview#api-nodes). +Below are a collection of nodes that work by calling external APIs. More information available in our [docs](https://docs.comfy.org/tutorials/api-nodes/overview). ## Development From 78672d0ee6d20d8269f324474643e5cc00f1c348 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 27 Jul 2025 04:42:58 -0700 Subject: [PATCH 066/759] Small readme update. (#9071) --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a148623cd..8a15136aa 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith ## Features - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything. - Image Models - - SD1.x, SD2.x, + - SD1.x, SD2.x ([unCLIP](https://comfyanonymous.github.io/ComfyUI_examples/unclip/)) - [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/) - [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/) - [SD3 and SD3.5](https://comfyanonymous.github.io/ComfyUI_examples/sd3/) @@ -84,9 +84,9 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Hunyuan3D 2.0](https://docs.comfy.org/tutorials/3d/hunyuan3D-2) - Asynchronous Queue system - Many optimizations: Only re-executes the parts of the workflow that changes between executions. -- Smart memory management: can automatically run models on GPUs with as low as 1GB vram. +- Smart memory management: can automatically run large models on GPUs with as low as 1GB vram with smart offloading. - Works even if you don't have a GPU with: ```--cpu``` (slow) -- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs and CLIP models. +- Can load ckpt and safetensors: All in one checkpoints or standalone diffusion models, VAEs and CLIP models. - Safe loading of ckpt, pt, pth, etc.. files. - Embeddings/Textual inversion - [Loras (regular, locon and loha)](https://comfyanonymous.github.io/ComfyUI_examples/lora/) @@ -98,7 +98,6 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Inpainting](https://comfyanonymous.github.io/ComfyUI_examples/inpaint/) with both regular and inpainting models. - [ControlNet and T2I-Adapter](https://comfyanonymous.github.io/ComfyUI_examples/controlnet/) - [Upscale Models (ESRGAN, ESRGAN variants, SwinIR, Swin2SR, etc...)](https://comfyanonymous.github.io/ComfyUI_examples/upscale_models/) -- [unCLIP Models](https://comfyanonymous.github.io/ComfyUI_examples/unclip/) - [GLIGEN](https://comfyanonymous.github.io/ComfyUI_examples/gligen/) - [Model Merging](https://comfyanonymous.github.io/ComfyUI_examples/model_merging/) - [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/) From e6d9f6274494c5ac96295deb1bea54de50189059 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Sun, 27 Jul 2025 16:51:36 -0700 Subject: [PATCH 067/759] Add Moonvalley Marey V2V node with updated input validation (#9069) * [moonvalley] Update V2V node to match API specification - Add exact resolution validation for supported resolutions (1920x1080, 1080x1920, 1152x1152, 1536x1152, 1152x1536) - Change frame count validation from divisible by 32 to 16 - Add MP4 container format validation - Remove internal parameters (steps, guidance_scale) from V2V inference params - Update video duration handling to support only 5 seconds (auto-trim if longer) - Add motion_intensity parameter (0-100) for Motion Transfer control type - Add get_container_format() method to VideoInput classes * update negative prompt --- comfy_api/input/video_types.py | 13 ++ comfy_api/input_impl/video_types.py | 12 ++ comfy_api_nodes/nodes_moonvalley.py | 225 +++++++++++++++------------- 3 files changed, 145 insertions(+), 105 deletions(-) diff --git a/comfy_api/input/video_types.py b/comfy_api/input/video_types.py index bb936e0a4..5d95dc507 100644 --- a/comfy_api/input/video_types.py +++ b/comfy_api/input/video_types.py @@ -2,6 +2,7 @@ from __future__ import annotations from abc import ABC, abstractmethod from typing import Optional, Union import io +import av from comfy_api.util import VideoContainer, VideoCodec, VideoComponents class VideoInput(ABC): @@ -70,3 +71,15 @@ class VideoInput(ABC): components = self.get_components() frame_count = components.images.shape[0] return float(frame_count / components.frame_rate) + + def get_container_format(self) -> str: + """ + Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). + + Returns: + Container format as string + """ + # Default implementation - subclasses should override for better performance + source = self.get_stream_source() + with av.open(source, mode="r") as container: + return container.format.name diff --git a/comfy_api/input_impl/video_types.py b/comfy_api/input_impl/video_types.py index 9ae818f4e..91e7c1bfa 100644 --- a/comfy_api/input_impl/video_types.py +++ b/comfy_api/input_impl/video_types.py @@ -121,6 +121,18 @@ class VideoFromFile(VideoInput): raise ValueError(f"Could not determine duration for file '{self.__file}'") + def get_container_format(self) -> str: + """ + Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). + + Returns: + Container format as string + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) + with av.open(self.__file, mode='r') as container: + return container.format.name + def get_components_internal(self, container: InputContainer) -> VideoComponents: # Get video frames frames = [] diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 057021efa..789fcef02 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -5,7 +5,6 @@ import torch from comfy_api_nodes.util.validation_utils import ( get_image_dimensions, validate_image_dimensions, - validate_video_dimensions, ) @@ -176,54 +175,76 @@ def validate_input_image( ) -def validate_input_video( - video: VideoInput, num_frames_out: int, with_frame_conditioning: bool = False -): +def validate_video_to_video_input(video: VideoInput) -> VideoInput: + """ + Validates and processes video input for Moonvalley Video-to-Video generation. + + Args: + video: Input video to validate + + Returns: + Validated and potentially trimmed video + + Raises: + ValueError: If video doesn't meet requirements + MoonvalleyApiError: If video duration is too short + """ + width, height = _get_video_dimensions(video) + _validate_video_dimensions(width, height) + _validate_container_format(video) + + return _validate_and_trim_duration(video) + + +def _get_video_dimensions(video: VideoInput) -> tuple[int, int]: + """Extracts video dimensions with error handling.""" try: - width, height = video.get_dimensions() + return video.get_dimensions() except Exception as e: logging.error("Error getting dimensions of video: %s", e) raise ValueError(f"Cannot get video dimensions: {e}") from e - validate_input_media(width, height, with_frame_conditioning) - validate_video_dimensions( - video, - min_width=MIN_VID_WIDTH, - min_height=MIN_VID_HEIGHT, - max_width=MAX_VID_WIDTH, - max_height=MAX_VID_HEIGHT, - ) - trimmed_video = validate_input_video_length(video, num_frames_out) - return trimmed_video +def _validate_video_dimensions(width: int, height: int) -> None: + """Validates video dimensions meet Moonvalley V2V requirements.""" + supported_resolutions = { + (1920, 1080), (1080, 1920), (1152, 1152), + (1536, 1152), (1152, 1536) + } + + if (width, height) not in supported_resolutions: + supported_list = ', '.join([f'{w}x{h}' for w, h in sorted(supported_resolutions)]) + raise ValueError(f"Resolution {width}x{height} not supported. Supported: {supported_list}") -def validate_input_video_length(video: VideoInput, num_frames: int): +def _validate_container_format(video: VideoInput) -> None: + """Validates video container format is MP4.""" + container_format = video.get_container_format() + if container_format not in ['mp4', 'mov,mp4,m4a,3gp,3g2,mj2']: + raise ValueError(f"Only MP4 container format supported. Got: {container_format}") - if video.get_duration() > 60: - raise MoonvalleyApiError( - "Input Video lenth should be less than 1min. Please trim." - ) - if num_frames == 128: - if video.get_duration() < 5: - raise MoonvalleyApiError( - "Input Video length is less than 5s. Please use a video longer than or equal to 5s." - ) - if video.get_duration() > 5: - # trim video to 5s - video = trim_video(video, 5) - if num_frames == 256: - if video.get_duration() < 10: - raise MoonvalleyApiError( - "Input Video length is less than 10s. Please use a video longer than or equal to 10s." - ) - if video.get_duration() > 10: - # trim video to 10s - video = trim_video(video, 10) +def _validate_and_trim_duration(video: VideoInput) -> VideoInput: + """Validates video duration and trims to 5 seconds if needed.""" + duration = video.get_duration() + _validate_minimum_duration(duration) + return _trim_if_too_long(video, duration) + + +def _validate_minimum_duration(duration: float) -> None: + """Ensures video is at least 5 seconds long.""" + if duration < 5: + raise MoonvalleyApiError("Input video must be at least 5 seconds long.") + + +def _trim_if_too_long(video: VideoInput, duration: float) -> VideoInput: + """Trims video to 5 seconds if longer.""" + if duration > 5: + return trim_video(video, 5) return video + def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: """ Returns a new VideoInput object trimmed from the beginning to the specified duration, @@ -278,15 +299,13 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels" ) - # Calculate target frame count that's divisible by 32 + # Calculate target frame count that's divisible by 16 fps = input_container.streams.video[0].average_rate estimated_frames = int(duration_sec * fps) - target_frames = ( - estimated_frames // 32 - ) * 32 # Round down to nearest multiple of 32 + target_frames = (estimated_frames // 16) * 16 # Round down to nearest multiple of 16 if target_frames == 0: - raise ValueError("Video too short: need at least 32 frames for Moonvalley") + raise ValueError("Video too short: need at least 16 frames for Moonvalley") frame_count = 0 audio_frame_count = 0 @@ -353,8 +372,8 @@ class BaseMoonvalleyVideoNode: "16:9 (1920 x 1080)": {"width": 1920, "height": 1080}, "9:16 (1080 x 1920)": {"width": 1080, "height": 1920}, "1:1 (1152 x 1152)": {"width": 1152, "height": 1152}, - "4:3 (1440 x 1080)": {"width": 1440, "height": 1080}, - "3:4 (1080 x 1440)": {"width": 1080, "height": 1440}, + "4:3 (1536 x 1152)": {"width": 1536, "height": 1152}, + "3:4 (1152 x 1536)": {"width": 1152, "height": 1536}, "21:9 (2560 x 1080)": {"width": 2560, "height": 1080}, } if resolution in res_map: @@ -494,7 +513,6 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): image = kwargs.get("image", None) if image is None: raise MoonvalleyApiError("image is required") - total_frames = get_total_frames_from_length() validate_input_image(image, True) validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) @@ -505,7 +523,7 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): steps=kwargs.get("steps"), seed=kwargs.get("seed"), guidance_scale=kwargs.get("prompt_adherence"), - num_frames=total_frames, + num_frames=128, width=width_height.get("width"), height=width_height.get("height"), use_negative_prompts=True, @@ -549,39 +567,45 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): @classmethod def INPUT_TYPES(cls): - input_types = super().INPUT_TYPES() - for param in ["resolution", "image"]: - if param in input_types["required"]: - del input_types["required"][param] - if param in input_types["optional"]: - del input_types["optional"][param] - input_types["optional"] = { - "video": ( - IO.VIDEO, - { - "default": "", - "multiline": False, - "tooltip": "The reference video used to generate the output video. Input a 5s video for 128 frames and a 10s video for 256 frames. Longer videos will be trimmed automatically.", - }, - ), - "control_type": ( - ["Motion Transfer", "Pose Transfer"], - {"default": "Motion Transfer"}, - ), - "motion_intensity": ( - "INT", - { - "default": 100, - "step": 1, - "min": 0, - "max": 100, - "tooltip": "Only used if control_type is 'Motion Transfer'", - }, - ), + return { + "required": { + "prompt": model_field_to_node_input( + IO.STRING, MoonvalleyVideoToVideoRequest, "prompt_text", + multiline=True + ), + "negative_prompt": model_field_to_node_input( + IO.STRING, + MoonvalleyVideoToVideoInferenceParams, + "negative_prompt", + multiline=True, + default="low-poly, flat shader, bad rigging, stiff animation, uncanny eyes, low-quality textures, looping glitch, cheap effect, overbloom, bloom spam, default lighting, game asset, stiff face, ugly specular, AI artifacts" + ), + "seed": model_field_to_node_input(IO.INT,MoonvalleyVideoToVideoInferenceParams, "seed", default=random.randint(0, 2**32 - 1), min=0, max=4294967295, step=1, display="number", tooltip="Random seed value", control_after_generate=True), + }, + "hidden": { + "auth_token": "AUTH_TOKEN_COMFY_ORG", + "comfy_api_key": "API_KEY_COMFY_ORG", + "unique_id": "UNIQUE_ID", + }, + "optional": { + "video": (IO.VIDEO, {"default": "", "multiline": False, "tooltip": "The reference video used to generate the output video. Must be at least 5 seconds long. Videos longer than 5s will be automatically trimmed. Only MP4 format supported."}), + "control_type": ( + ["Motion Transfer", "Pose Transfer"], + {"default": "Motion Transfer"}, + ), + "motion_intensity": ( + "INT", + { + "default": 100, + "step": 1, + "min": 0, + "max": 100, + "tooltip": "Only used if control_type is 'Motion Transfer'", + }, + ) + } } - return input_types - RETURN_TYPES = ("VIDEO",) RETURN_NAMES = ("video",) @@ -589,15 +613,13 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs ): video = kwargs.get("video") - num_frames = get_total_frames_from_length() if not video: raise MoonvalleyApiError("video is required") - """Validate video input""" video_url = "" if video: - validated_video = validate_input_video(video, num_frames, False) + validated_video = validate_video_to_video_input(video) video_url = upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) control_type = kwargs.get("control_type") @@ -605,12 +627,16 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): """Validate prompts and inference input""" validate_prompts(prompt, negative_prompt) - inference_params = MoonvalleyVideoToVideoInferenceParams( + + # Only include motion_intensity for Motion Transfer + control_params = {} + if control_type == "Motion Transfer" and motion_intensity is not None: + control_params['motion_intensity'] = motion_intensity + + inference_params=MoonvalleyVideoToVideoInferenceParams( negative_prompt=negative_prompt, - steps=kwargs.get("steps"), seed=kwargs.get("seed"), - guidance_scale=kwargs.get("prompt_adherence"), - control_params={"motion_intensity": motion_intensity}, + control_params=control_params ) control = self.parseControlParameter(control_type) @@ -667,17 +693,16 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): ): validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) - num_frames = get_total_frames_from_length() - inference_params = MoonvalleyTextToVideoInferenceParams( - negative_prompt=negative_prompt, - steps=kwargs.get("steps"), - seed=kwargs.get("seed"), - guidance_scale=kwargs.get("prompt_adherence"), - num_frames=num_frames, - width=width_height.get("width"), - height=width_height.get("height"), - ) + inference_params=MoonvalleyTextToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + num_frames=128, + width=width_height.get("width"), + height=width_height.get("height"), + ) request = MoonvalleyTextToVideoRequest( prompt_text=prompt, inference_params=inference_params ) @@ -707,22 +732,12 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): NODE_CLASS_MAPPINGS = { "MoonvalleyImg2VideoNode": MoonvalleyImg2VideoNode, "MoonvalleyTxt2VideoNode": MoonvalleyTxt2VideoNode, - # "MoonvalleyVideo2VideoNode": MoonvalleyVideo2VideoNode, + "MoonvalleyVideo2VideoNode": MoonvalleyVideo2VideoNode, } NODE_DISPLAY_NAME_MAPPINGS = { "MoonvalleyImg2VideoNode": "Moonvalley Marey Image to Video", "MoonvalleyTxt2VideoNode": "Moonvalley Marey Text to Video", - # "MoonvalleyVideo2VideoNode": "Moonvalley Marey Video to Video", + "MoonvalleyVideo2VideoNode": "Moonvalley Marey Video to Video", } - - -def get_total_frames_from_length(length="5s"): - # if length == '5s': - # return 128 - # elif length == '10s': - # return 256 - return 128 - # else: - # raise MoonvalleyApiError("length is required") From d0210fe2e5df25b329926e20e3be32451fd5b841 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Mon, 28 Jul 2025 19:55:02 +0800 Subject: [PATCH 068/759] Update template to 0.1.41 (#9079) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 33a59b4be..14a085a2c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.40 +comfyui-workflow-templates==0.1.41 comfyui-embedded-docs==0.2.4 torch torchsde From a88788dce6b0d7b5e2876c7cd0121b45e80f4ad8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 28 Jul 2025 05:00:23 -0700 Subject: [PATCH 069/759] Wan 2.2 support. (#9080) --- comfy/latent_formats.py | 76 ++++ comfy/ldm/wan/model.py | 16 +- comfy/ldm/wan/vae2_2.py | 726 ++++++++++++++++++++++++++++++++++++++ comfy/model_base.py | 30 +- comfy/model_detection.py | 2 + comfy/sd.py | 36 +- comfy/supported_models.py | 15 +- comfy_extras/nodes_wan.py | 44 +++ 8 files changed, 926 insertions(+), 19 deletions(-) create mode 100644 comfy/ldm/wan/vae2_2.py diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 82d9f9bb8..caf4991fc 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -457,6 +457,82 @@ class Wan21(LatentFormat): latents_std = self.latents_std.to(latent.device, latent.dtype) return latent * latents_std / self.scale_factor + latents_mean +class Wan22(Wan21): + latent_channels = 48 + latent_dimensions = 3 + + latent_rgb_factors = [ + [ 0.0119, 0.0103, 0.0046], + [-0.1062, -0.0504, 0.0165], + [ 0.0140, 0.0409, 0.0491], + [-0.0813, -0.0677, 0.0607], + [ 0.0656, 0.0851, 0.0808], + [ 0.0264, 0.0463, 0.0912], + [ 0.0295, 0.0326, 0.0590], + [-0.0244, -0.0270, 0.0025], + [ 0.0443, -0.0102, 0.0288], + [-0.0465, -0.0090, -0.0205], + [ 0.0359, 0.0236, 0.0082], + [-0.0776, 0.0854, 0.1048], + [ 0.0564, 0.0264, 0.0561], + [ 0.0006, 0.0594, 0.0418], + [-0.0319, -0.0542, -0.0637], + [-0.0268, 0.0024, 0.0260], + [ 0.0539, 0.0265, 0.0358], + [-0.0359, -0.0312, -0.0287], + [-0.0285, -0.1032, -0.1237], + [ 0.1041, 0.0537, 0.0622], + [-0.0086, -0.0374, -0.0051], + [ 0.0390, 0.0670, 0.2863], + [ 0.0069, 0.0144, 0.0082], + [ 0.0006, -0.0167, 0.0079], + [ 0.0313, -0.0574, -0.0232], + [-0.1454, -0.0902, -0.0481], + [ 0.0714, 0.0827, 0.0447], + [-0.0304, -0.0574, -0.0196], + [ 0.0401, 0.0384, 0.0204], + [-0.0758, -0.0297, -0.0014], + [ 0.0568, 0.1307, 0.1372], + [-0.0055, -0.0310, -0.0380], + [ 0.0239, -0.0305, 0.0325], + [-0.0663, -0.0673, -0.0140], + [-0.0416, -0.0047, -0.0023], + [ 0.0166, 0.0112, -0.0093], + [-0.0211, 0.0011, 0.0331], + [ 0.1833, 0.1466, 0.2250], + [-0.0368, 0.0370, 0.0295], + [-0.3441, -0.3543, -0.2008], + [-0.0479, -0.0489, -0.0420], + [-0.0660, -0.0153, 0.0800], + [-0.0101, 0.0068, 0.0156], + [-0.0690, -0.0452, -0.0927], + [-0.0145, 0.0041, 0.0015], + [ 0.0421, 0.0451, 0.0373], + [ 0.0504, -0.0483, -0.0356], + [-0.0837, 0.0168, 0.0055] + ] + + latent_rgb_factors_bias = [0.0317, -0.0878, -0.1388] + + def __init__(self): + self.scale_factor = 1.0 + self.latents_mean = torch.tensor([ + -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557, + -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825, + -0.2246, -0.1207, -0.0698, 0.5109, 0.2665, -0.2108, -0.2158, 0.2502, + -0.2055, -0.0322, 0.1109, 0.1567, -0.0729, 0.0899, -0.2799, -0.1230, + -0.0313, -0.1649, 0.0117, 0.0723, -0.2839, -0.2083, -0.0520, 0.3748, + 0.0152, 0.1957, 0.1433, -0.2944, 0.3573, -0.0548, -0.1681, -0.0667, + ]).view(1, self.latent_channels, 1, 1, 1) + self.latents_std = torch.tensor([ + 0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.4990, 0.4818, 0.5013, + 0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978, + 0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659, + 0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093, + 0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887, + 0.3971, 1.0600, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744 + ]).view(1, self.latent_channels, 1, 1, 1) + class Hunyuan3Dv2(LatentFormat): latent_channels = 64 latent_dimensions = 1 diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 1d6edb354..b9e47e9f7 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -201,8 +201,10 @@ class WanAttentionBlock(nn.Module): freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2] """ # assert e.dtype == torch.float32 - - e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1) + if e.ndim < 4: + e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1) + else: + e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e).unbind(2) # assert e[0].dtype == torch.float32 # self-attention @@ -325,7 +327,10 @@ class Head(nn.Module): e(Tensor): Shape [B, C] """ # assert e.dtype == torch.float32 - e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e.unsqueeze(1)).chunk(2, dim=1) + if e.ndim < 3: + e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e.unsqueeze(1)).chunk(2, dim=1) + else: + e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e.unsqueeze(2)).unbind(2) x = (self.head(self.norm(x) * (1 + e[1]) + e[0])) return x @@ -506,8 +511,9 @@ class WanModel(torch.nn.Module): # time embeddings e = self.time_embedding( - sinusoidal_embedding_1d(self.freq_dim, t).to(dtype=x[0].dtype)) - e0 = self.time_projection(e).unflatten(1, (6, self.dim)) + sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype)) + e = e.reshape(t.shape[0], -1, e.shape[-1]) + e0 = self.time_projection(e).unflatten(2, (6, self.dim)) # context context = self.text_embedding(context) diff --git a/comfy/ldm/wan/vae2_2.py b/comfy/ldm/wan/vae2_2.py new file mode 100644 index 000000000..c2c150e10 --- /dev/null +++ b/comfy/ldm/wan/vae2_2.py @@ -0,0 +1,726 @@ +# original version: https://github.com/Wan-Video/Wan2.2/blob/main/wan/modules/vae2_2.py +# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from .vae import AttentionBlock, CausalConv3d, RMS_norm + +import comfy.ops +ops = comfy.ops.disable_weight_init + +CACHE_T = 2 + + +class Resample(nn.Module): + + def __init__(self, dim, mode): + assert mode in ( + "none", + "upsample2d", + "upsample3d", + "downsample2d", + "downsample3d", + ) + super().__init__() + self.dim = dim + self.mode = mode + + # layers + if mode == "upsample2d": + self.resample = nn.Sequential( + nn.Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), + ops.Conv2d(dim, dim, 3, padding=1), + ) + elif mode == "upsample3d": + self.resample = nn.Sequential( + nn.Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), + ops.Conv2d(dim, dim, 3, padding=1), + # ops.Conv2d(dim, dim//2, 3, padding=1) + ) + self.time_conv = CausalConv3d( + dim, dim * 2, (3, 1, 1), padding=(1, 0, 0)) + elif mode == "downsample2d": + self.resample = nn.Sequential( + nn.ZeroPad2d((0, 1, 0, 1)), + ops.Conv2d(dim, dim, 3, stride=(2, 2))) + elif mode == "downsample3d": + self.resample = nn.Sequential( + nn.ZeroPad2d((0, 1, 0, 1)), + ops.Conv2d(dim, dim, 3, stride=(2, 2))) + self.time_conv = CausalConv3d( + dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0)) + else: + self.resample = nn.Identity() + + def forward(self, x, feat_cache=None, feat_idx=[0]): + b, c, t, h, w = x.size() + if self.mode == "upsample3d": + if feat_cache is not None: + idx = feat_idx[0] + if feat_cache[idx] is None: + feat_cache[idx] = "Rep" + feat_idx[0] += 1 + else: + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and + feat_cache[idx] != "Rep"): + # cache last frame of last two chunk + cache_x = torch.cat( + [ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), + cache_x, + ], + dim=2, + ) + if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and + feat_cache[idx] == "Rep"): + cache_x = torch.cat( + [ + torch.zeros_like(cache_x).to(cache_x.device), + cache_x + ], + dim=2, + ) + if feat_cache[idx] == "Rep": + x = self.time_conv(x) + else: + x = self.time_conv(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + x = x.reshape(b, 2, c, t, h, w) + x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), + 3) + x = x.reshape(b, c, t * 2, h, w) + t = x.shape[2] + x = rearrange(x, "b c t h w -> (b t) c h w") + x = self.resample(x) + x = rearrange(x, "(b t) c h w -> b c t h w", t=t) + + if self.mode == "downsample3d": + if feat_cache is not None: + idx = feat_idx[0] + if feat_cache[idx] is None: + feat_cache[idx] = x.clone() + feat_idx[0] += 1 + else: + cache_x = x[:, :, -1:, :, :].clone() + x = self.time_conv( + torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2)) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + return x + + +class ResidualBlock(nn.Module): + + def __init__(self, in_dim, out_dim, dropout=0.0): + super().__init__() + self.in_dim = in_dim + self.out_dim = out_dim + + # layers + self.residual = nn.Sequential( + RMS_norm(in_dim, images=False), + nn.SiLU(), + CausalConv3d(in_dim, out_dim, 3, padding=1), + RMS_norm(out_dim, images=False), + nn.SiLU(), + nn.Dropout(dropout), + CausalConv3d(out_dim, out_dim, 3, padding=1), + ) + self.shortcut = ( + CausalConv3d(in_dim, out_dim, 1) + if in_dim != out_dim else nn.Identity()) + + def forward(self, x, feat_cache=None, feat_idx=[0]): + h = self.shortcut(x) + for layer in self.residual: + if isinstance(layer, CausalConv3d) and feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + # cache last frame of last two chunk + cache_x = torch.cat( + [ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), + cache_x, + ], + dim=2, + ) + x = layer(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = layer(x) + return x + h + + +def patchify(x, patch_size): + if patch_size == 1: + return x + if x.dim() == 4: + x = rearrange( + x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size, r=patch_size) + elif x.dim() == 5: + x = rearrange( + x, + "b c f (h q) (w r) -> b (c r q) f h w", + q=patch_size, + r=patch_size, + ) + else: + raise ValueError(f"Invalid input shape: {x.shape}") + + return x + + +def unpatchify(x, patch_size): + if patch_size == 1: + return x + + if x.dim() == 4: + x = rearrange( + x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size) + elif x.dim() == 5: + x = rearrange( + x, + "b (c r q) f h w -> b c f (h q) (w r)", + q=patch_size, + r=patch_size, + ) + return x + + +class AvgDown3D(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + factor_t, + factor_s=1, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.factor_t = factor_t + self.factor_s = factor_s + self.factor = self.factor_t * self.factor_s * self.factor_s + + assert in_channels * self.factor % out_channels == 0 + self.group_size = in_channels * self.factor // out_channels + + def forward(self, x: torch.Tensor) -> torch.Tensor: + pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t + pad = (0, 0, 0, 0, pad_t, 0) + x = F.pad(x, pad) + B, C, T, H, W = x.shape + x = x.view( + B, + C, + T // self.factor_t, + self.factor_t, + H // self.factor_s, + self.factor_s, + W // self.factor_s, + self.factor_s, + ) + x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous() + x = x.view( + B, + C * self.factor, + T // self.factor_t, + H // self.factor_s, + W // self.factor_s, + ) + x = x.view( + B, + self.out_channels, + self.group_size, + T // self.factor_t, + H // self.factor_s, + W // self.factor_s, + ) + x = x.mean(dim=2) + return x + + +class DupUp3D(nn.Module): + + def __init__( + self, + in_channels: int, + out_channels: int, + factor_t, + factor_s=1, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + + self.factor_t = factor_t + self.factor_s = factor_s + self.factor = self.factor_t * self.factor_s * self.factor_s + + assert out_channels * self.factor % in_channels == 0 + self.repeats = out_channels * self.factor // in_channels + + def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor: + x = x.repeat_interleave(self.repeats, dim=1) + x = x.view( + x.size(0), + self.out_channels, + self.factor_t, + self.factor_s, + self.factor_s, + x.size(2), + x.size(3), + x.size(4), + ) + x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous() + x = x.view( + x.size(0), + self.out_channels, + x.size(2) * self.factor_t, + x.size(4) * self.factor_s, + x.size(6) * self.factor_s, + ) + if first_chunk: + x = x[:, :, self.factor_t - 1:, :, :] + return x + + +class Down_ResidualBlock(nn.Module): + + def __init__(self, + in_dim, + out_dim, + dropout, + mult, + temperal_downsample=False, + down_flag=False): + super().__init__() + + # Shortcut path with downsample + self.avg_shortcut = AvgDown3D( + in_dim, + out_dim, + factor_t=2 if temperal_downsample else 1, + factor_s=2 if down_flag else 1, + ) + + # Main path with residual blocks and downsample + downsamples = [] + for _ in range(mult): + downsamples.append(ResidualBlock(in_dim, out_dim, dropout)) + in_dim = out_dim + + # Add the final downsample block + if down_flag: + mode = "downsample3d" if temperal_downsample else "downsample2d" + downsamples.append(Resample(out_dim, mode=mode)) + + self.downsamples = nn.Sequential(*downsamples) + + def forward(self, x, feat_cache=None, feat_idx=[0]): + x_copy = x.clone() + for module in self.downsamples: + x = module(x, feat_cache, feat_idx) + + return x + self.avg_shortcut(x_copy) + + +class Up_ResidualBlock(nn.Module): + + def __init__(self, + in_dim, + out_dim, + dropout, + mult, + temperal_upsample=False, + up_flag=False): + super().__init__() + # Shortcut path with upsample + if up_flag: + self.avg_shortcut = DupUp3D( + in_dim, + out_dim, + factor_t=2 if temperal_upsample else 1, + factor_s=2 if up_flag else 1, + ) + else: + self.avg_shortcut = None + + # Main path with residual blocks and upsample + upsamples = [] + for _ in range(mult): + upsamples.append(ResidualBlock(in_dim, out_dim, dropout)) + in_dim = out_dim + + # Add the final upsample block + if up_flag: + mode = "upsample3d" if temperal_upsample else "upsample2d" + upsamples.append(Resample(out_dim, mode=mode)) + + self.upsamples = nn.Sequential(*upsamples) + + def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False): + x_main = x.clone() + for module in self.upsamples: + x_main = module(x_main, feat_cache, feat_idx) + if self.avg_shortcut is not None: + x_shortcut = self.avg_shortcut(x, first_chunk) + return x_main + x_shortcut + else: + return x_main + + +class Encoder3d(nn.Module): + + def __init__( + self, + dim=128, + z_dim=4, + dim_mult=[1, 2, 4, 4], + num_res_blocks=2, + attn_scales=[], + temperal_downsample=[True, True, False], + dropout=0.0, + ): + super().__init__() + self.dim = dim + self.z_dim = z_dim + self.dim_mult = dim_mult + self.num_res_blocks = num_res_blocks + self.attn_scales = attn_scales + self.temperal_downsample = temperal_downsample + + # dimensions + dims = [dim * u for u in [1] + dim_mult] + scale = 1.0 + + # init block + self.conv1 = CausalConv3d(12, dims[0], 3, padding=1) + + # downsample blocks + downsamples = [] + for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])): + t_down_flag = ( + temperal_downsample[i] + if i < len(temperal_downsample) else False) + downsamples.append( + Down_ResidualBlock( + in_dim=in_dim, + out_dim=out_dim, + dropout=dropout, + mult=num_res_blocks, + temperal_downsample=t_down_flag, + down_flag=i != len(dim_mult) - 1, + )) + scale /= 2.0 + self.downsamples = nn.Sequential(*downsamples) + + # middle blocks + self.middle = nn.Sequential( + ResidualBlock(out_dim, out_dim, dropout), + AttentionBlock(out_dim), + ResidualBlock(out_dim, out_dim, dropout), + ) + + # # output blocks + self.head = nn.Sequential( + RMS_norm(out_dim, images=False), + nn.SiLU(), + CausalConv3d(out_dim, z_dim, 3, padding=1), + ) + + def forward(self, x, feat_cache=None, feat_idx=[0]): + + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + cache_x = torch.cat( + [ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), + cache_x, + ], + dim=2, + ) + x = self.conv1(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv1(x) + + ## downsamples + for layer in self.downsamples: + if feat_cache is not None: + x = layer(x, feat_cache, feat_idx) + else: + x = layer(x) + + ## middle + for layer in self.middle: + if isinstance(layer, ResidualBlock) and feat_cache is not None: + x = layer(x, feat_cache, feat_idx) + else: + x = layer(x) + + ## head + for layer in self.head: + if isinstance(layer, CausalConv3d) and feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + cache_x = torch.cat( + [ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), + cache_x, + ], + dim=2, + ) + x = layer(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = layer(x) + + return x + + +class Decoder3d(nn.Module): + + def __init__( + self, + dim=128, + z_dim=4, + dim_mult=[1, 2, 4, 4], + num_res_blocks=2, + attn_scales=[], + temperal_upsample=[False, True, True], + dropout=0.0, + ): + super().__init__() + self.dim = dim + self.z_dim = z_dim + self.dim_mult = dim_mult + self.num_res_blocks = num_res_blocks + self.attn_scales = attn_scales + self.temperal_upsample = temperal_upsample + + # dimensions + dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]] + # init block + self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1) + + # middle blocks + self.middle = nn.Sequential( + ResidualBlock(dims[0], dims[0], dropout), + AttentionBlock(dims[0]), + ResidualBlock(dims[0], dims[0], dropout), + ) + + # upsample blocks + upsamples = [] + for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])): + t_up_flag = temperal_upsample[i] if i < len( + temperal_upsample) else False + upsamples.append( + Up_ResidualBlock( + in_dim=in_dim, + out_dim=out_dim, + dropout=dropout, + mult=num_res_blocks + 1, + temperal_upsample=t_up_flag, + up_flag=i != len(dim_mult) - 1, + )) + self.upsamples = nn.Sequential(*upsamples) + + # output blocks + self.head = nn.Sequential( + RMS_norm(out_dim, images=False), + nn.SiLU(), + CausalConv3d(out_dim, 12, 3, padding=1), + ) + + def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False): + if feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + cache_x = torch.cat( + [ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), + cache_x, + ], + dim=2, + ) + x = self.conv1(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = self.conv1(x) + + for layer in self.middle: + if isinstance(layer, ResidualBlock) and feat_cache is not None: + x = layer(x, feat_cache, feat_idx) + else: + x = layer(x) + + ## upsamples + for layer in self.upsamples: + if feat_cache is not None: + x = layer(x, feat_cache, feat_idx, first_chunk) + else: + x = layer(x) + + ## head + for layer in self.head: + if isinstance(layer, CausalConv3d) and feat_cache is not None: + idx = feat_idx[0] + cache_x = x[:, :, -CACHE_T:, :, :].clone() + if cache_x.shape[2] < 2 and feat_cache[idx] is not None: + cache_x = torch.cat( + [ + feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to( + cache_x.device), + cache_x, + ], + dim=2, + ) + x = layer(x, feat_cache[idx]) + feat_cache[idx] = cache_x + feat_idx[0] += 1 + else: + x = layer(x) + return x + + +def count_conv3d(model): + count = 0 + for m in model.modules(): + if isinstance(m, CausalConv3d): + count += 1 + return count + + +class WanVAE(nn.Module): + + def __init__( + self, + dim=160, + dec_dim=256, + z_dim=16, + dim_mult=[1, 2, 4, 4], + num_res_blocks=2, + attn_scales=[], + temperal_downsample=[True, True, False], + dropout=0.0, + ): + super().__init__() + self.dim = dim + self.z_dim = z_dim + self.dim_mult = dim_mult + self.num_res_blocks = num_res_blocks + self.attn_scales = attn_scales + self.temperal_downsample = temperal_downsample + self.temperal_upsample = temperal_downsample[::-1] + + # modules + self.encoder = Encoder3d( + dim, + z_dim * 2, + dim_mult, + num_res_blocks, + attn_scales, + self.temperal_downsample, + dropout, + ) + self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1) + self.conv2 = CausalConv3d(z_dim, z_dim, 1) + self.decoder = Decoder3d( + dec_dim, + z_dim, + dim_mult, + num_res_blocks, + attn_scales, + self.temperal_upsample, + dropout, + ) + + def encode(self, x): + self.clear_cache() + x = patchify(x, patch_size=2) + t = x.shape[2] + iter_ = 1 + (t - 1) // 4 + for i in range(iter_): + self._enc_conv_idx = [0] + if i == 0: + out = self.encoder( + x[:, :, :1, :, :], + feat_cache=self._enc_feat_map, + feat_idx=self._enc_conv_idx, + ) + else: + out_ = self.encoder( + x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :], + feat_cache=self._enc_feat_map, + feat_idx=self._enc_conv_idx, + ) + out = torch.cat([out, out_], 2) + mu, log_var = self.conv1(out).chunk(2, dim=1) + self.clear_cache() + return mu + + def decode(self, z): + self.clear_cache() + iter_ = z.shape[2] + x = self.conv2(z) + for i in range(iter_): + self._conv_idx = [0] + if i == 0: + out = self.decoder( + x[:, :, i:i + 1, :, :], + feat_cache=self._feat_map, + feat_idx=self._conv_idx, + first_chunk=True, + ) + else: + out_ = self.decoder( + x[:, :, i:i + 1, :, :], + feat_cache=self._feat_map, + feat_idx=self._conv_idx, + ) + out = torch.cat([out, out_], 2) + out = unpatchify(out, patch_size=2) + self.clear_cache() + return out + + def reparameterize(self, mu, log_var): + std = torch.exp(0.5 * log_var) + eps = torch.randn_like(std) + return eps * std + mu + + def sample(self, imgs, deterministic=False): + mu, log_var = self.encode(imgs) + if deterministic: + return mu + std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0)) + return mu + std * torch.randn_like(std) + + def clear_cache(self): + self._conv_num = count_conv3d(self.decoder) + self._conv_idx = [0] + self._feat_map = [None] * self._conv_num + # cache encode + self._enc_conv_num = count_conv3d(self.encoder) + self._enc_conv_idx = [0] + self._enc_feat_map = [None] * self._enc_conv_num diff --git a/comfy/model_base.py b/comfy/model_base.py index 4392355ea..d019b991a 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1097,8 +1097,9 @@ class WAN21(BaseModel): image[:, i: i + 16] = self.process_latent_in(image[:, i: i + 16]) image = utils.resize_to_batch_size(image, noise.shape[0]) - if not self.image_to_video or extra_channels == image.shape[1]: - return image + if extra_channels != image.shape[1] + 4: + if not self.image_to_video or extra_channels == image.shape[1]: + return image if image.shape[1] > (extra_channels - 4): image = image[:, :(extra_channels - 4)] @@ -1182,6 +1183,31 @@ class WAN21_Camera(WAN21): out['camera_conditions'] = comfy.conds.CONDRegular(camera_conditions) return out +class WAN22(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) + self.image_to_video = image_to_video + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + + denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None)) + if denoise_mask is not None: + out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask) + return out + + def process_timestep(self, timestep, x, denoise_mask=None, **kwargs): + if denoise_mask is None: + return timestep + temp_ts = (torch.mean(denoise_mask[:, :, :, ::2, ::2], dim=1, keepdim=True) * timestep.view([timestep.shape[0]] + [1] * (denoise_mask.ndim - 1))).reshape(timestep.shape[0], -1) + return temp_ts + + def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs): + return latent_image + class Hunyuan3Dv2(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan3d.model.Hunyuan3Dv2) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 18232ade3..9fc1f42de 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -346,7 +346,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config = {} dit_config["image_model"] = "wan2.1" dim = state_dict['{}head.modulation'.format(key_prefix)].shape[-1] + out_dim = state_dict['{}head.head.weight'.format(key_prefix)].shape[0] // 4 dit_config["dim"] = dim + dit_config["out_dim"] = out_dim dit_config["num_heads"] = dim // 128 dit_config["ffn_dim"] = state_dict['{}blocks.0.ffn.0.weight'.format(key_prefix)].shape[0] dit_config["num_layers"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.') diff --git a/comfy/sd.py b/comfy/sd.py index 8081b167c..e0498e585 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -14,6 +14,7 @@ import comfy.ldm.genmo.vae.model import comfy.ldm.lightricks.vae.causal_video_autoencoder import comfy.ldm.cosmos.vae import comfy.ldm.wan.vae +import comfy.ldm.wan.vae2_2 import comfy.ldm.hunyuan3d.vae import comfy.ldm.ace.vae.music_dcae_pipeline import yaml @@ -420,17 +421,30 @@ class VAE: self.memory_used_encode = lambda shape, dtype: (50 * (round((shape[2] + 7) / 8) * 8) * shape[3] * shape[4]) * model_management.dtype_size(dtype) self.working_dtypes = [torch.bfloat16, torch.float32] elif "decoder.middle.0.residual.0.gamma" in sd: - self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8) - self.upscale_index_formula = (4, 8, 8) - self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8) - self.downscale_index_formula = (4, 8, 8) - self.latent_dim = 3 - self.latent_channels = 16 - ddconfig = {"dim": 96, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0} - self.first_stage_model = comfy.ldm.wan.vae.WanVAE(**ddconfig) - self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] - self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype) - self.memory_used_decode = lambda shape, dtype: 7000 * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype) + if "decoder.upsamples.0.upsamples.0.residual.2.weight" in sd: # Wan 2.2 VAE + self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16) + self.upscale_index_formula = (4, 16, 16) + self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16) + self.downscale_index_formula = (4, 16, 16) + self.latent_dim = 3 + self.latent_channels = 48 + ddconfig = {"dim": 160, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0} + self.first_stage_model = comfy.ldm.wan.vae2_2.WanVAE(**ddconfig) + self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] + self.memory_used_encode = lambda shape, dtype: 3300 * shape[3] * shape[4] * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: 8000 * shape[3] * shape[4] * (16 * 16) * model_management.dtype_size(dtype) + else: # Wan 2.1 VAE + self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8) + self.upscale_index_formula = (4, 8, 8) + self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8) + self.downscale_index_formula = (4, 8, 8) + self.latent_dim = 3 + self.latent_channels = 16 + ddconfig = {"dim": 96, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0} + self.first_stage_model = comfy.ldm.wan.vae.WanVAE(**ddconfig) + self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] + self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: 7000 * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype) elif "geo_decoder.cross_attn_decoder.ln_1.bias" in sd: self.latent_dim = 1 ln_post = "geo_decoder.ln_post.weight" in sd diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 2ca3857f7..8f3f4652d 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1059,6 +1059,19 @@ class WAN21_Vace(WAN21_T2V): out = model_base.WAN21_Vace(self, image_to_video=False, device=device) return out +class WAN22_T2V(WAN21_T2V): + unet_config = { + "image_model": "wan2.1", + "model_type": "t2v", + "out_dim": 48, + } + + latent_format = latent_formats.Wan22 + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.WAN22(self, image_to_video=True, device=device) + return out + class Hunyuan3Dv2(supported_models_base.BASE): unet_config = { "image_model": "hunyuan3d2", @@ -1217,6 +1230,6 @@ class Omnigen2(supported_models_base.BASE): return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index d71908f31..0b92c68ac 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -685,6 +685,49 @@ class WanTrackToVideo: out_latent["samples"] = latent return (positive, negative, out_latent) + +class Wan22ImageToVideoLatent: + @classmethod + def INPUT_TYPES(s): + return {"required": {"vae": ("VAE", ), + "width": ("INT", {"default": 1280, "min": 32, "max": nodes.MAX_RESOLUTION, "step": 32}), + "height": ("INT", {"default": 704, "min": 32, "max": nodes.MAX_RESOLUTION, "step": 32}), + "length": ("INT", {"default": 49, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), + "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), + }, + "optional": {"start_image": ("IMAGE", ), + }} + + + RETURN_TYPES = ("LATENT",) + FUNCTION = "encode" + + CATEGORY = "conditioning/inpaint" + + def encode(self, vae, width, height, length, batch_size, start_image=None): + latent = torch.zeros([1, 48, ((length - 1) // 4) + 1, height // 16, width // 16], device=comfy.model_management.intermediate_device()) + + if start_image is None: + out_latent = {} + out_latent["samples"] = latent + return (out_latent,) + + mask = torch.ones([latent.shape[0], 1, ((length - 1) // 4) + 1, latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device()) + + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + latent_temp = vae.encode(start_image) + latent[:, :, :latent_temp.shape[-3]] = latent_temp + mask[:, :, :latent_temp.shape[-3]] *= 0.0 + + out_latent = {} + latent_format = comfy.latent_formats.Wan22() + latent = latent_format.process_out(latent) * mask + latent * (1.0 - mask) + out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1)) + out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1)) + return (out_latent,) + + NODE_CLASS_MAPPINGS = { "WanTrackToVideo": WanTrackToVideo, "WanImageToVideo": WanImageToVideo, @@ -695,4 +738,5 @@ NODE_CLASS_MAPPINGS = { "TrimVideoLatent": TrimVideoLatent, "WanCameraImageToVideo": WanCameraImageToVideo, "WanPhantomSubjectToVideo": WanPhantomSubjectToVideo, + "Wan22ImageToVideoLatent": Wan22ImageToVideoLatent, } From 9f1388c0a38b9b6ebde0cdde904d94d709d3ca82 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 28 Jul 2025 05:01:53 -0700 Subject: [PATCH 070/759] Add wan2.2 to readme. (#9081) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8a15136aa..befc4c006 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/) - [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/) and [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/) - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/) + - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/) - Audio Models - [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/) - [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/) From 5d4cc3ba1b412b9acacd37fd23d59e0e1654f83c Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 28 Jul 2025 08:04:04 -0400 Subject: [PATCH 071/759] ComfyUI 0.3.46 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 180ecaf8a..315710dd2 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.45" +__version__ = "0.3.46" diff --git a/pyproject.toml b/pyproject.toml index b1d6d9df6..59c4c70fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.45" +version = "0.3.46" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From c60dc4177c16d50da025bda4ac7fd513bb86e699 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 28 Jul 2025 11:48:19 -0700 Subject: [PATCH 072/759] Remove unecessary clones in the wan2.2 VAE. (#9083) --- comfy/ldm/wan/vae2_2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comfy/ldm/wan/vae2_2.py b/comfy/ldm/wan/vae2_2.py index c2c150e10..b9c2d1a26 100644 --- a/comfy/ldm/wan/vae2_2.py +++ b/comfy/ldm/wan/vae2_2.py @@ -136,7 +136,7 @@ class ResidualBlock(nn.Module): if in_dim != out_dim else nn.Identity()) def forward(self, x, feat_cache=None, feat_idx=[0]): - h = self.shortcut(x) + old_x = x for layer in self.residual: if isinstance(layer, CausalConv3d) and feat_cache is not None: idx = feat_idx[0] @@ -156,7 +156,7 @@ class ResidualBlock(nn.Module): feat_idx[0] += 1 else: x = layer(x) - return x + h + return x + self.shortcut(old_x) def patchify(x, patch_size): @@ -327,7 +327,7 @@ class Down_ResidualBlock(nn.Module): self.downsamples = nn.Sequential(*downsamples) def forward(self, x, feat_cache=None, feat_idx=[0]): - x_copy = x.clone() + x_copy = x for module in self.downsamples: x = module(x, feat_cache, feat_idx) @@ -369,7 +369,7 @@ class Up_ResidualBlock(nn.Module): self.upsamples = nn.Sequential(*upsamples) def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False): - x_main = x.clone() + x_main = x for module in self.upsamples: x_main = module(x_main, feat_cache, feat_idx) if self.avg_shortcut is not None: From 7d593baf919f468670425c0d9068ead8a3e9b05f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 29 Jul 2025 01:07:45 -0700 Subject: [PATCH 073/759] Extra reserved vram on large cards on windows. (#9093) --- comfy/model_management.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/model_management.py b/comfy/model_management.py index 232d363aa..9e6149d60 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -529,6 +529,8 @@ WINDOWS = any(platform.win32_ver()) EXTRA_RESERVED_VRAM = 400 * 1024 * 1024 if WINDOWS: EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue + if total_vram > (15 * 1024): # more extra reserved vram on 16GB+ cards + EXTRA_RESERVED_VRAM += 100 * 1024 * 1024 if args.reserve_vram is not None: EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024 From dca6bdd4fa70770f910ea80e61c72e1f450530d9 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 29 Jul 2025 16:44:18 -0700 Subject: [PATCH 074/759] Make wan2.2 5B i2v take a lot less memory. (#9102) --- comfy/ldm/wan/model.py | 21 ++++++++++++++++----- comfy/model_base.py | 2 +- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index b9e47e9f7..a93a13c86 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -146,6 +146,15 @@ WAN_CROSSATTENTION_CLASSES = { } +def repeat_e(e, x): + repeats = 1 + if e.shape[1] > 1: + repeats = x.shape[1] // e.shape[1] + if repeats == 1: + return e + return torch.repeat_interleave(e, repeats, dim=1) + + class WanAttentionBlock(nn.Module): def __init__(self, @@ -201,6 +210,7 @@ class WanAttentionBlock(nn.Module): freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2] """ # assert e.dtype == torch.float32 + if e.ndim < 4: e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1) else: @@ -209,15 +219,15 @@ class WanAttentionBlock(nn.Module): # self-attention y = self.self_attn( - self.norm1(x) * (1 + e[1]) + e[0], + self.norm1(x) * (1 + repeat_e(e[1], x)) + repeat_e(e[0], x), freqs) - x = x + y * e[2] + x = x + y * repeat_e(e[2], x) # cross-attention & ffn x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len) - y = self.ffn(self.norm2(x) * (1 + e[4]) + e[3]) - x = x + y * e[5] + y = self.ffn(self.norm2(x) * (1 + repeat_e(e[4], x)) + repeat_e(e[3], x)) + x = x + y * repeat_e(e[5], x) return x @@ -331,7 +341,8 @@ class Head(nn.Module): e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e.unsqueeze(1)).chunk(2, dim=1) else: e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e.unsqueeze(2)).unbind(2) - x = (self.head(self.norm(x) * (1 + e[1]) + e[0])) + + x = (self.head(self.norm(x) * (1 + repeat_e(e[1], x)) + repeat_e(e[0], x))) return x diff --git a/comfy/model_base.py b/comfy/model_base.py index d019b991a..6b7978949 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1202,7 +1202,7 @@ class WAN22(BaseModel): def process_timestep(self, timestep, x, denoise_mask=None, **kwargs): if denoise_mask is None: return timestep - temp_ts = (torch.mean(denoise_mask[:, :, :, ::2, ::2], dim=1, keepdim=True) * timestep.view([timestep.shape[0]] + [1] * (denoise_mask.ndim - 1))).reshape(timestep.shape[0], -1) + temp_ts = (torch.mean(denoise_mask[:, :, :, :, :], dim=(1, 3, 4), keepdim=True) * timestep.view([timestep.shape[0]] + [1] * (denoise_mask.ndim - 1))).reshape(timestep.shape[0], -1) return temp_ts def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs): From 2f74e17975696d829af455845c584574bbc85774 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 29 Jul 2025 20:08:25 -0400 Subject: [PATCH 075/759] ComfyUI version 0.3.47 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 315710dd2..20a2e892a 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.46" +__version__ = "0.3.47" diff --git a/pyproject.toml b/pyproject.toml index 59c4c70fb..723c93069 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.46" +version = "0.3.47" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 0a3d062e0660741146d50f6601e3eeca211d92d5 Mon Sep 17 00:00:00 2001 From: guill Date: Tue, 29 Jul 2025 19:17:22 -0700 Subject: [PATCH 076/759] ComfyAPI Core v0.0.2 (#8962) * ComfyAPI Core v0.0.2 * Respond to PR feedback * Fix Python 3.9 errors * Fix missing backward compatibility proxy * Reorganize types a bit The input types, input impls, and utility types are now all available in the versioned API. See the change in `comfy_extras/nodes_video.py` for an example of their usage. * Remove the need for `--generate-api-stubs` * Fix generated stubs differing by Python version * Fix ruff formatting issues --- comfy_api/generate_api_stubs.py | 86 ++ comfy_api/input/__init__.py | 12 +- comfy_api/input/basic_types.py | 32 +- comfy_api/input/video_types.py | 89 +- comfy_api/input_impl/__init__.py | 4 +- comfy_api/input_impl/video_types.py | 326 +----- comfy_api/internal/__init__.py | 7 + comfy_api/internal/api_registry.py | 39 + comfy_api/internal/async_to_sync.py | 987 ++++++++++++++++++ comfy_api/internal/singleton.py | 33 + comfy_api/latest/__init__.py | 106 ++ comfy_api/latest/_input/__init__.py | 10 + comfy_api/latest/_input/basic_types.py | 42 + comfy_api/latest/_input/video_types.py | 85 ++ comfy_api/latest/_input_impl/__init__.py | 7 + comfy_api/latest/_input_impl/video_types.py | 324 ++++++ comfy_api/latest/_util/__init__.py | 8 + comfy_api/latest/_util/video_types.py | 52 + .../latest/generated/ComfyAPISyncStub.pyi | 20 + comfy_api/util.py | 8 + comfy_api/util/__init__.py | 4 +- comfy_api/util/video_types.py | 61 +- comfy_api/v0_0_1/__init__.py | 42 + .../v0_0_1/generated/ComfyAPISyncStub.pyi | 20 + comfy_api/v0_0_2/__init__.py | 43 + .../v0_0_2/generated/ComfyAPISyncStub.pyi | 20 + comfy_api/version_list.py | 12 + comfy_api_nodes/apis/request_logger.py | 2 + comfy_api_nodes/nodes_gemini.py | 2 + comfy_execution/progress.py | 16 +- comfy_extras/nodes_video.py | 23 +- main.py | 4 +- nodes.py | 36 +- pyproject.toml | 2 +- .../testing_nodes/testing-pack/__init__.py | 4 +- .../testing-pack/api_test_nodes.py | 78 ++ 36 files changed, 2128 insertions(+), 518 deletions(-) create mode 100644 comfy_api/generate_api_stubs.py create mode 100644 comfy_api/internal/__init__.py create mode 100644 comfy_api/internal/api_registry.py create mode 100644 comfy_api/internal/async_to_sync.py create mode 100644 comfy_api/internal/singleton.py create mode 100644 comfy_api/latest/__init__.py create mode 100644 comfy_api/latest/_input/__init__.py create mode 100644 comfy_api/latest/_input/basic_types.py create mode 100644 comfy_api/latest/_input/video_types.py create mode 100644 comfy_api/latest/_input_impl/__init__.py create mode 100644 comfy_api/latest/_input_impl/video_types.py create mode 100644 comfy_api/latest/_util/__init__.py create mode 100644 comfy_api/latest/_util/video_types.py create mode 100644 comfy_api/latest/generated/ComfyAPISyncStub.pyi create mode 100644 comfy_api/util.py create mode 100644 comfy_api/v0_0_1/__init__.py create mode 100644 comfy_api/v0_0_1/generated/ComfyAPISyncStub.pyi create mode 100644 comfy_api/v0_0_2/__init__.py create mode 100644 comfy_api/v0_0_2/generated/ComfyAPISyncStub.pyi create mode 100644 comfy_api/version_list.py create mode 100644 tests/inference/testing_nodes/testing-pack/api_test_nodes.py diff --git a/comfy_api/generate_api_stubs.py b/comfy_api/generate_api_stubs.py new file mode 100644 index 000000000..604a7eced --- /dev/null +++ b/comfy_api/generate_api_stubs.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +""" +Script to generate .pyi stub files for the synchronous API wrappers. +This allows generating stubs without running the full ComfyUI application. +""" + +import os +import sys +import logging +import importlib + +# Add ComfyUI to path so we can import modules +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from comfy_api.internal.async_to_sync import AsyncToSyncConverter +from comfy_api.version_list import supported_versions + + +def generate_stubs_for_module(module_name: str) -> None: + """Generate stub files for a specific module that exports ComfyAPI and ComfyAPISync.""" + try: + # Import the module + module = importlib.import_module(module_name) + + # Check if module has ComfyAPISync (the sync wrapper) + if hasattr(module, "ComfyAPISync"): + # Module already has a sync class + api_class = getattr(module, "ComfyAPI", None) + sync_class = getattr(module, "ComfyAPISync") + + if api_class: + # Generate the stub file + AsyncToSyncConverter.generate_stub_file(api_class, sync_class) + logging.info(f"Generated stub file for {module_name}") + else: + logging.warning( + f"Module {module_name} has ComfyAPISync but no ComfyAPI" + ) + + elif hasattr(module, "ComfyAPI"): + # Module only has async API, need to create sync wrapper first + from comfy_api.internal.async_to_sync import create_sync_class + + api_class = getattr(module, "ComfyAPI") + sync_class = create_sync_class(api_class) + + # Generate the stub file + AsyncToSyncConverter.generate_stub_file(api_class, sync_class) + logging.info(f"Generated stub file for {module_name}") + else: + logging.warning( + f"Module {module_name} does not export ComfyAPI or ComfyAPISync" + ) + + except Exception as e: + logging.error(f"Failed to generate stub for {module_name}: {e}") + import traceback + + traceback.print_exc() + + +def main(): + """Main function to generate all API stub files.""" + logging.basicConfig(level=logging.INFO) + + logging.info("Starting stub generation...") + + # Dynamically get module names from supported_versions + api_modules = [] + for api_class in supported_versions: + # Extract module name from the class + module_name = api_class.__module__ + if module_name not in api_modules: + api_modules.append(module_name) + + logging.info(f"Found {len(api_modules)} API modules: {api_modules}") + + # Generate stubs for each module + for module_name in api_modules: + generate_stubs_for_module(module_name) + + logging.info("Stub generation complete!") + + +if __name__ == "__main__": + main() diff --git a/comfy_api/input/__init__.py b/comfy_api/input/__init__.py index 66667946f..68ff78270 100644 --- a/comfy_api/input/__init__.py +++ b/comfy_api/input/__init__.py @@ -1,8 +1,16 @@ -from .basic_types import ImageInput, AudioInput -from .video_types import VideoInput +# This file only exists for backwards compatibility. +from comfy_api.latest._input import ( + ImageInput, + AudioInput, + MaskInput, + LatentInput, + VideoInput, +) __all__ = [ "ImageInput", "AudioInput", + "MaskInput", + "LatentInput", "VideoInput", ] diff --git a/comfy_api/input/basic_types.py b/comfy_api/input/basic_types.py index 033fb7e27..5eadce86a 100644 --- a/comfy_api/input/basic_types.py +++ b/comfy_api/input/basic_types.py @@ -1,20 +1,14 @@ -import torch -from typing import TypedDict - -ImageInput = torch.Tensor -""" -An image in format [B, H, W, C] where B is the batch size, C is the number of channels, -""" - -class AudioInput(TypedDict): - """ - TypedDict representing audio input. - """ - - waveform: torch.Tensor - """ - Tensor in the format [B, C, T] where B is the batch size, C is the number of channels, - """ - - sample_rate: int +# This file only exists for backwards compatibility. +from comfy_api.latest._input.basic_types import ( + ImageInput, + AudioInput, + MaskInput, + LatentInput, +) +__all__ = [ + "ImageInput", + "AudioInput", + "MaskInput", + "LatentInput", +] diff --git a/comfy_api/input/video_types.py b/comfy_api/input/video_types.py index 5d95dc507..9ace78cbc 100644 --- a/comfy_api/input/video_types.py +++ b/comfy_api/input/video_types.py @@ -1,85 +1,6 @@ -from __future__ import annotations -from abc import ABC, abstractmethod -from typing import Optional, Union -import io -import av -from comfy_api.util import VideoContainer, VideoCodec, VideoComponents +# This file only exists for backwards compatibility. +from comfy_api.latest._input.video_types import VideoInput -class VideoInput(ABC): - """ - Abstract base class for video input types. - """ - - @abstractmethod - def get_components(self) -> VideoComponents: - """ - Abstract method to get the video components (images, audio, and frame rate). - - Returns: - VideoComponents containing images, audio, and frame rate - """ - pass - - @abstractmethod - def save_to( - self, - path: str, - format: VideoContainer = VideoContainer.AUTO, - codec: VideoCodec = VideoCodec.AUTO, - metadata: Optional[dict] = None - ): - """ - Abstract method to save the video input to a file. - """ - pass - - def get_stream_source(self) -> Union[str, io.BytesIO]: - """ - Get a streamable source for the video. This allows processing without - loading the entire video into memory. - - Returns: - Either a file path (str) or a BytesIO object that can be opened with av. - - Default implementation creates a BytesIO buffer, but subclasses should - override this for better performance when possible. - """ - buffer = io.BytesIO() - self.save_to(buffer) - buffer.seek(0) - return buffer - - # Provide a default implementation, but subclasses can provide optimized versions - # if possible. - def get_dimensions(self) -> tuple[int, int]: - """ - Returns the dimensions of the video input. - - Returns: - Tuple of (width, height) - """ - components = self.get_components() - return components.images.shape[2], components.images.shape[1] - - def get_duration(self) -> float: - """ - Returns the duration of the video in seconds. - - Returns: - Duration in seconds - """ - components = self.get_components() - frame_count = components.images.shape[0] - return float(frame_count / components.frame_rate) - - def get_container_format(self) -> str: - """ - Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). - - Returns: - Container format as string - """ - # Default implementation - subclasses should override for better performance - source = self.get_stream_source() - with av.open(source, mode="r") as container: - return container.format.name +__all__ = [ + "VideoInput", +] diff --git a/comfy_api/input_impl/__init__.py b/comfy_api/input_impl/__init__.py index 02901b8b9..b78ff0c08 100644 --- a/comfy_api/input_impl/__init__.py +++ b/comfy_api/input_impl/__init__.py @@ -1,7 +1,7 @@ -from .video_types import VideoFromFile, VideoFromComponents +# This file only exists for backwards compatibility. +from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents __all__ = [ - # Implementations "VideoFromFile", "VideoFromComponents", ] diff --git a/comfy_api/input_impl/video_types.py b/comfy_api/input_impl/video_types.py index 91e7c1bfa..bd2e56ad5 100644 --- a/comfy_api/input_impl/video_types.py +++ b/comfy_api/input_impl/video_types.py @@ -1,324 +1,2 @@ -from __future__ import annotations -from av.container import InputContainer -from av.subtitles.stream import SubtitleStream -from fractions import Fraction -from typing import Optional -from comfy_api.input import AudioInput -import av -import io -import json -import numpy as np -import torch -from comfy_api.input import VideoInput -from comfy_api.util import VideoContainer, VideoCodec, VideoComponents - - -def container_to_output_format(container_format: str | None) -> str | None: - """ - A container's `format` may be a comma-separated list of formats. - E.g., iso container's `format` may be `mov,mp4,m4a,3gp,3g2,mj2`. - However, writing to a file/stream with `av.open` requires a single format, - or `None` to auto-detect. - """ - if not container_format: - return None # Auto-detect - - if "," not in container_format: - return container_format - - formats = container_format.split(",") - return formats[0] - - -def get_open_write_kwargs( - dest: str | io.BytesIO, container_format: str, to_format: str | None -) -> dict: - """Get kwargs for writing a `VideoFromFile` to a file/stream with `av.open`""" - open_kwargs = { - "mode": "w", - # If isobmff, preserve custom metadata tags (workflow, prompt, extra_pnginfo) - "options": {"movflags": "use_metadata_tags"}, - } - - is_write_to_buffer = isinstance(dest, io.BytesIO) - if is_write_to_buffer: - # Set output format explicitly, since it cannot be inferred from file extension - if to_format == VideoContainer.AUTO: - to_format = container_format.lower() - elif isinstance(to_format, str): - to_format = to_format.lower() - open_kwargs["format"] = container_to_output_format(to_format) - - return open_kwargs - - -class VideoFromFile(VideoInput): - """ - Class representing video input from a file. - """ - - def __init__(self, file: str | io.BytesIO): - """ - Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object - containing the file contents. - """ - self.__file = file - - def get_stream_source(self) -> str | io.BytesIO: - """ - Return the underlying file source for efficient streaming. - This avoids unnecessary memory copies when the source is already a file path. - """ - if isinstance(self.__file, io.BytesIO): - self.__file.seek(0) - return self.__file - - def get_dimensions(self) -> tuple[int, int]: - """ - Returns the dimensions of the video input. - - Returns: - Tuple of (width, height) - """ - if isinstance(self.__file, io.BytesIO): - self.__file.seek(0) # Reset the BytesIO object to the beginning - with av.open(self.__file, mode='r') as container: - for stream in container.streams: - if stream.type == 'video': - assert isinstance(stream, av.VideoStream) - return stream.width, stream.height - raise ValueError(f"No video stream found in file '{self.__file}'") - - def get_duration(self) -> float: - """ - Returns the duration of the video in seconds. - - Returns: - Duration in seconds - """ - if isinstance(self.__file, io.BytesIO): - self.__file.seek(0) - with av.open(self.__file, mode="r") as container: - if container.duration is not None: - return float(container.duration / av.time_base) - - # Fallback: calculate from frame count and frame rate - video_stream = next( - (s for s in container.streams if s.type == "video"), None - ) - if video_stream and video_stream.frames and video_stream.average_rate: - return float(video_stream.frames / video_stream.average_rate) - - # Last resort: decode frames to count them - if video_stream and video_stream.average_rate: - frame_count = 0 - container.seek(0) - for packet in container.demux(video_stream): - for _ in packet.decode(): - frame_count += 1 - if frame_count > 0: - return float(frame_count / video_stream.average_rate) - - raise ValueError(f"Could not determine duration for file '{self.__file}'") - - def get_container_format(self) -> str: - """ - Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). - - Returns: - Container format as string - """ - if isinstance(self.__file, io.BytesIO): - self.__file.seek(0) - with av.open(self.__file, mode='r') as container: - return container.format.name - - def get_components_internal(self, container: InputContainer) -> VideoComponents: - # Get video frames - frames = [] - for frame in container.decode(video=0): - img = frame.to_ndarray(format='rgb24') # shape: (H, W, 3) - img = torch.from_numpy(img) / 255.0 # shape: (H, W, 3) - frames.append(img) - - images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0) - - # Get frame rate - video_stream = next(s for s in container.streams if s.type == 'video') - frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1) - - # Get audio if available - audio = None - try: - container.seek(0) # Reset the container to the beginning - for stream in container.streams: - if stream.type != 'audio': - continue - assert isinstance(stream, av.AudioStream) - audio_frames = [] - for packet in container.demux(stream): - for frame in packet.decode(): - assert isinstance(frame, av.AudioFrame) - audio_frames.append(frame.to_ndarray()) # shape: (channels, samples) - if len(audio_frames) > 0: - audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples) - audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples) - audio = AudioInput({ - "waveform": audio_tensor, - "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1, - }) - except StopIteration: - pass # No audio stream - - metadata = container.metadata - return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata) - - def get_components(self) -> VideoComponents: - if isinstance(self.__file, io.BytesIO): - self.__file.seek(0) # Reset the BytesIO object to the beginning - with av.open(self.__file, mode='r') as container: - return self.get_components_internal(container) - raise ValueError(f"No video stream found in file '{self.__file}'") - - def save_to( - self, - path: str | io.BytesIO, - format: VideoContainer = VideoContainer.AUTO, - codec: VideoCodec = VideoCodec.AUTO, - metadata: Optional[dict] = None - ): - if isinstance(self.__file, io.BytesIO): - self.__file.seek(0) # Reset the BytesIO object to the beginning - with av.open(self.__file, mode='r') as container: - container_format = container.format.name - video_encoding = container.streams.video[0].codec.name if len(container.streams.video) > 0 else None - reuse_streams = True - if format != VideoContainer.AUTO and format not in container_format.split(","): - reuse_streams = False - if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None: - reuse_streams = False - - if not reuse_streams: - components = self.get_components_internal(container) - video = VideoFromComponents(components) - return video.save_to( - path, - format=format, - codec=codec, - metadata=metadata - ) - - streams = container.streams - - open_kwargs = get_open_write_kwargs(path, container_format, format) - with av.open(path, **open_kwargs) as output_container: - # Copy over the original metadata - for key, value in container.metadata.items(): - if metadata is None or key not in metadata: - output_container.metadata[key] = value - - # Add our new metadata - if metadata is not None: - for key, value in metadata.items(): - if isinstance(value, str): - output_container.metadata[key] = value - else: - output_container.metadata[key] = json.dumps(value) - - # Add streams to the new container - stream_map = {} - for stream in streams: - if isinstance(stream, (av.VideoStream, av.AudioStream, SubtitleStream)): - out_stream = output_container.add_stream_from_template(template=stream, opaque=True) - stream_map[stream] = out_stream - - # Write packets to the new container - for packet in container.demux(): - if packet.stream in stream_map and packet.dts is not None: - packet.stream = stream_map[packet.stream] - output_container.mux(packet) - -class VideoFromComponents(VideoInput): - """ - Class representing video input from tensors. - """ - - def __init__(self, components: VideoComponents): - self.__components = components - - def get_components(self) -> VideoComponents: - return VideoComponents( - images=self.__components.images, - audio=self.__components.audio, - frame_rate=self.__components.frame_rate - ) - - def save_to( - self, - path: str, - format: VideoContainer = VideoContainer.AUTO, - codec: VideoCodec = VideoCodec.AUTO, - metadata: Optional[dict] = None - ): - if format != VideoContainer.AUTO and format != VideoContainer.MP4: - raise ValueError("Only MP4 format is supported for now") - if codec != VideoCodec.AUTO and codec != VideoCodec.H264: - raise ValueError("Only H264 codec is supported for now") - with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}) as output: - # Add metadata before writing any streams - if metadata is not None: - for key, value in metadata.items(): - output.metadata[key] = json.dumps(value) - - frame_rate = Fraction(round(self.__components.frame_rate * 1000), 1000) - # Create a video stream - video_stream = output.add_stream('h264', rate=frame_rate) - video_stream.width = self.__components.images.shape[2] - video_stream.height = self.__components.images.shape[1] - video_stream.pix_fmt = 'yuv420p' - - # Create an audio stream - audio_sample_rate = 1 - audio_stream: Optional[av.AudioStream] = None - if self.__components.audio: - audio_sample_rate = int(self.__components.audio['sample_rate']) - audio_stream = output.add_stream('aac', rate=audio_sample_rate) - audio_stream.sample_rate = audio_sample_rate - audio_stream.format = 'fltp' - - # Encode video - for i, frame in enumerate(self.__components.images): - img = (frame * 255).clamp(0, 255).byte().cpu().numpy() # shape: (H, W, 3) - frame = av.VideoFrame.from_ndarray(img, format='rgb24') - frame = frame.reformat(format='yuv420p') # Convert to YUV420P as required by h264 - packet = video_stream.encode(frame) - output.mux(packet) - - # Flush video - packet = video_stream.encode(None) - output.mux(packet) - - if audio_stream and self.__components.audio: - # Encode audio - samples_per_frame = int(audio_sample_rate / frame_rate) - num_frames = self.__components.audio['waveform'].shape[2] // samples_per_frame - for i in range(num_frames): - start = i * samples_per_frame - end = start + samples_per_frame - # TODO(Feature) - Add support for stereo audio - chunk = ( - self.__components.audio["waveform"][0, 0, start:end] - .unsqueeze(0) - .contiguous() - .numpy() - ) - audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono') - audio_frame.sample_rate = audio_sample_rate - audio_frame.pts = i * samples_per_frame - for packet in audio_stream.encode(audio_frame): - output.mux(packet) - - # Flush audio - for packet in audio_stream.encode(None): - output.mux(packet) - +# This file only exists for backwards compatibility. +from comfy_api.latest._input_impl.video_types import * # noqa: F403 diff --git a/comfy_api/internal/__init__.py b/comfy_api/internal/__init__.py new file mode 100644 index 000000000..c00b1fdbb --- /dev/null +++ b/comfy_api/internal/__init__.py @@ -0,0 +1,7 @@ +# Internal infrastructure for ComfyAPI +from .api_registry import ( + ComfyAPIBase as ComfyAPIBase, + ComfyAPIWithVersion as ComfyAPIWithVersion, + register_versions as register_versions, + get_all_versions as get_all_versions, +) diff --git a/comfy_api/internal/api_registry.py b/comfy_api/internal/api_registry.py new file mode 100644 index 000000000..7e3375cf6 --- /dev/null +++ b/comfy_api/internal/api_registry.py @@ -0,0 +1,39 @@ +from typing import Type, List, NamedTuple +from comfy_api.internal.singleton import ProxiedSingleton +from packaging import version as packaging_version + + +class ComfyAPIBase(ProxiedSingleton): + def __init__(self): + pass + + +class ComfyAPIWithVersion(NamedTuple): + version: str + api_class: Type[ComfyAPIBase] + + +def parse_version(version_str: str) -> packaging_version.Version: + """ + Parses a version string into a packaging_version.Version object. + Raises ValueError if the version string is invalid. + """ + if version_str == "latest": + return packaging_version.parse("9999999.9999999.9999999") + return packaging_version.parse(version_str) + + +registered_versions: List[ComfyAPIWithVersion] = [] + + +def register_versions(versions: List[ComfyAPIWithVersion]): + versions.sort(key=lambda x: parse_version(x.version)) + global registered_versions + registered_versions = versions + + +def get_all_versions() -> List[ComfyAPIWithVersion]: + """ + Returns a list of all registered ComfyAPI versions. + """ + return registered_versions diff --git a/comfy_api/internal/async_to_sync.py b/comfy_api/internal/async_to_sync.py new file mode 100644 index 000000000..f5f805a62 --- /dev/null +++ b/comfy_api/internal/async_to_sync.py @@ -0,0 +1,987 @@ +import asyncio +import concurrent.futures +import contextvars +import functools +import inspect +import logging +import os +import textwrap +import threading +from enum import Enum +from typing import Optional, Type, get_origin, get_args + + +class TypeTracker: + """Tracks types discovered during stub generation for automatic import generation.""" + + def __init__(self): + self.discovered_types = {} # type_name -> (module, qualname) + self.builtin_types = { + "Any", + "Dict", + "List", + "Optional", + "Tuple", + "Union", + "Set", + "Sequence", + "cast", + "NamedTuple", + "str", + "int", + "float", + "bool", + "None", + "bytes", + "object", + "type", + "dict", + "list", + "tuple", + "set", + } + self.already_imported = ( + set() + ) # Track types already imported to avoid duplicates + + def track_type(self, annotation): + """Track a type annotation and record its module/import info.""" + if annotation is None or annotation is type(None): + return + + # Skip builtins and typing module types we already import + type_name = getattr(annotation, "__name__", None) + if type_name and ( + type_name in self.builtin_types or type_name in self.already_imported + ): + return + + # Get module and qualname + module = getattr(annotation, "__module__", None) + qualname = getattr(annotation, "__qualname__", type_name or "") + + # Skip types from typing module (they're already imported) + if module == "typing": + return + + # Skip UnionType and GenericAlias from types module as they're handled specially + if module == "types" and type_name in ("UnionType", "GenericAlias"): + return + + if module and module not in ["builtins", "__main__"]: + # Store the type info + if type_name: + self.discovered_types[type_name] = (module, qualname) + + def get_imports(self, main_module_name: str) -> list[str]: + """Generate import statements for all discovered types.""" + imports = [] + imports_by_module = {} + + for type_name, (module, qualname) in sorted(self.discovered_types.items()): + # Skip types from the main module (they're already imported) + if main_module_name and module == main_module_name: + continue + + if module not in imports_by_module: + imports_by_module[module] = [] + if type_name not in imports_by_module[module]: # Avoid duplicates + imports_by_module[module].append(type_name) + + # Generate import statements + for module, types in sorted(imports_by_module.items()): + if len(types) == 1: + imports.append(f"from {module} import {types[0]}") + else: + imports.append(f"from {module} import {', '.join(sorted(set(types)))}") + + return imports + + +class AsyncToSyncConverter: + """ + Provides utilities to convert async classes to sync classes with proper type hints. + """ + + _thread_pool: Optional[concurrent.futures.ThreadPoolExecutor] = None + _thread_pool_lock = threading.Lock() + _thread_pool_initialized = False + + @classmethod + def get_thread_pool(cls, max_workers=None) -> concurrent.futures.ThreadPoolExecutor: + """Get or create the shared thread pool with proper thread-safe initialization.""" + # Fast path - check if already initialized without acquiring lock + if cls._thread_pool_initialized: + assert cls._thread_pool is not None, "Thread pool should be initialized" + return cls._thread_pool + + # Slow path - acquire lock and create pool if needed + with cls._thread_pool_lock: + if not cls._thread_pool_initialized: + cls._thread_pool = concurrent.futures.ThreadPoolExecutor( + max_workers=max_workers, thread_name_prefix="async_to_sync_" + ) + cls._thread_pool_initialized = True + + # This should never be None at this point, but add assertion for type checker + assert cls._thread_pool is not None + return cls._thread_pool + + @classmethod + def run_async_in_thread(cls, coro_func, *args, **kwargs): + """ + Run an async function in a separate thread from the thread pool. + Blocks until the async function completes. + Properly propagates contextvars between threads and manages event loops. + """ + # Capture current context - this includes all context variables + context = contextvars.copy_context() + + # Store the result and any exception that occurs + result_container: dict = {"result": None, "exception": None} + + # Function that runs in the thread pool + def run_in_thread(): + # Create new event loop for this thread + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + # Create the coroutine within the context + async def run_with_context(): + # The coroutine function might access context variables + return await coro_func(*args, **kwargs) + + # Run the coroutine with the captured context + # This ensures all context variables are available in the async function + result = context.run(loop.run_until_complete, run_with_context()) + result_container["result"] = result + except Exception as e: + # Store the exception to re-raise in the calling thread + result_container["exception"] = e + finally: + # Ensure event loop is properly closed to prevent warnings + try: + # Cancel any remaining tasks + pending = asyncio.all_tasks(loop) + for task in pending: + task.cancel() + + # Run the loop briefly to handle cancellations + if pending: + loop.run_until_complete( + asyncio.gather(*pending, return_exceptions=True) + ) + except Exception: + pass # Ignore errors during cleanup + + # Close the event loop + loop.close() + + # Clear the event loop from the thread + asyncio.set_event_loop(None) + + # Submit to thread pool and wait for result + thread_pool = cls.get_thread_pool() + future = thread_pool.submit(run_in_thread) + future.result() # Wait for completion + + # Re-raise any exception that occurred in the thread + if result_container["exception"] is not None: + raise result_container["exception"] + + return result_container["result"] + + @classmethod + def create_sync_class(cls, async_class: Type, thread_pool_size=10) -> Type: + """ + Creates a new class with synchronous versions of all async methods. + + Args: + async_class: The async class to convert + thread_pool_size: Size of thread pool to use + + Returns: + A new class with sync versions of all async methods + """ + sync_class_name = "ComfyAPISyncStub" + cls.get_thread_pool(thread_pool_size) + + # Create a proper class with docstrings and proper base classes + sync_class_dict = { + "__doc__": async_class.__doc__, + "__module__": async_class.__module__, + "__qualname__": sync_class_name, + "__orig_class__": async_class, # Store original class for typing references + } + + # Create __init__ method + def __init__(self, *args, **kwargs): + self._async_instance = async_class(*args, **kwargs) + + # Handle annotated class attributes (like execution: Execution) + # Get all annotations from the class hierarchy + all_annotations = {} + for base_class in reversed(inspect.getmro(async_class)): + if hasattr(base_class, "__annotations__"): + all_annotations.update(base_class.__annotations__) + + # For each annotated attribute, check if it needs to be created or wrapped + for attr_name, attr_type in all_annotations.items(): + if hasattr(self._async_instance, attr_name): + # Attribute exists on the instance + attr = getattr(self._async_instance, attr_name) + # Check if this attribute needs a sync wrapper + if hasattr(attr, "__class__"): + from comfy_api.internal.singleton import ProxiedSingleton + + if isinstance(attr, ProxiedSingleton): + # Create a sync version of this attribute + try: + sync_attr_class = cls.create_sync_class(attr.__class__) + # Create instance of the sync wrapper with the async instance + sync_attr = object.__new__(sync_attr_class) # type: ignore + sync_attr._async_instance = attr + setattr(self, attr_name, sync_attr) + except Exception: + # If we can't create a sync version, keep the original + setattr(self, attr_name, attr) + else: + # Not async, just copy the reference + setattr(self, attr_name, attr) + else: + # Attribute doesn't exist, but is annotated - create it + # This handles cases like execution: Execution + if isinstance(attr_type, type): + # Check if the type is defined as an inner class + if hasattr(async_class, attr_type.__name__): + inner_class = getattr(async_class, attr_type.__name__) + from comfy_api.internal.singleton import ProxiedSingleton + + # Create an instance of the inner class + try: + # For ProxiedSingleton classes, get or create the singleton instance + if issubclass(inner_class, ProxiedSingleton): + async_instance = inner_class.get_instance() + else: + async_instance = inner_class() + + # Create sync wrapper + sync_attr_class = cls.create_sync_class(inner_class) + sync_attr = object.__new__(sync_attr_class) # type: ignore + sync_attr._async_instance = async_instance + setattr(self, attr_name, sync_attr) + # Also set on the async instance for consistency + setattr(self._async_instance, attr_name, async_instance) + except Exception as e: + logging.warning( + f"Failed to create instance for {attr_name}: {e}" + ) + + # Handle other instance attributes that might not be annotated + for name, attr in inspect.getmembers(self._async_instance): + if name.startswith("_") or hasattr(self, name): + continue + + # If attribute is an instance of a class, and that class is defined in the original class + # we need to check if it needs a sync wrapper + if isinstance(attr, object) and not isinstance( + attr, (str, int, float, bool, list, dict, tuple) + ): + from comfy_api.internal.singleton import ProxiedSingleton + + if isinstance(attr, ProxiedSingleton): + # Create a sync version of this nested class + try: + sync_attr_class = cls.create_sync_class(attr.__class__) + # Create instance of the sync wrapper with the async instance + sync_attr = object.__new__(sync_attr_class) # type: ignore + sync_attr._async_instance = attr + setattr(self, name, sync_attr) + except Exception: + # If we can't create a sync version, keep the original + setattr(self, name, attr) + + sync_class_dict["__init__"] = __init__ + + # Process methods from the async class + for name, method in inspect.getmembers( + async_class, predicate=inspect.isfunction + ): + if name.startswith("_"): + continue + + # Extract the actual return type from a coroutine + if inspect.iscoroutinefunction(method): + # Create sync version of async method with proper signature + @functools.wraps(method) + def sync_method(self, *args, _method_name=name, **kwargs): + async_method = getattr(self._async_instance, _method_name) + return AsyncToSyncConverter.run_async_in_thread( + async_method, *args, **kwargs + ) + + # Add to the class dict + sync_class_dict[name] = sync_method + else: + # For regular methods, create a proxy method + @functools.wraps(method) + def proxy_method(self, *args, _method_name=name, **kwargs): + method = getattr(self._async_instance, _method_name) + return method(*args, **kwargs) + + # Add to the class dict + sync_class_dict[name] = proxy_method + + # Handle property access + for name, prop in inspect.getmembers( + async_class, lambda x: isinstance(x, property) + ): + + def make_property(name, prop_obj): + def getter(self): + value = getattr(self._async_instance, name) + if inspect.iscoroutinefunction(value): + + def sync_fn(*args, **kwargs): + return AsyncToSyncConverter.run_async_in_thread( + value, *args, **kwargs + ) + + return sync_fn + return value + + def setter(self, value): + setattr(self._async_instance, name, value) + + return property(getter, setter if prop_obj.fset else None) + + sync_class_dict[name] = make_property(name, prop) + + # Create the class + sync_class = type(sync_class_name, (object,), sync_class_dict) + + return sync_class + + @classmethod + def _format_type_annotation( + cls, annotation, type_tracker: Optional[TypeTracker] = None + ) -> str: + """Convert a type annotation to its string representation for stub files.""" + if ( + annotation is inspect.Parameter.empty + or annotation is inspect.Signature.empty + ): + return "Any" + + # Handle None type + if annotation is type(None): + return "None" + + # Track the type if we have a tracker + if type_tracker: + type_tracker.track_type(annotation) + + # Try using typing.get_origin/get_args for Python 3.8+ + try: + origin = get_origin(annotation) + args = get_args(annotation) + + if origin is not None: + # Track the origin type + if type_tracker: + type_tracker.track_type(origin) + + # Get the origin name + origin_name = getattr(origin, "__name__", str(origin)) + if "." in origin_name: + origin_name = origin_name.split(".")[-1] + + # Special handling for types.UnionType (Python 3.10+ pipe operator) + # Convert to old-style Union for compatibility + if str(origin) == "" or origin_name == "UnionType": + origin_name = "Union" + + # Format arguments recursively + if args: + formatted_args = [] + for arg in args: + # Track each type in the union + if type_tracker: + type_tracker.track_type(arg) + formatted_args.append(cls._format_type_annotation(arg, type_tracker)) + return f"{origin_name}[{', '.join(formatted_args)}]" + else: + return origin_name + except (AttributeError, TypeError): + # Fallback for older Python versions or non-generic types + pass + + # Handle generic types the old way for compatibility + if hasattr(annotation, "__origin__") and hasattr(annotation, "__args__"): + origin = annotation.__origin__ + origin_name = ( + origin.__name__ + if hasattr(origin, "__name__") + else str(origin).split("'")[1] + ) + + # Format each type argument + args = [] + for arg in annotation.__args__: + args.append(cls._format_type_annotation(arg, type_tracker)) + + return f"{origin_name}[{', '.join(args)}]" + + # Handle regular types with __name__ + if hasattr(annotation, "__name__"): + return annotation.__name__ + + # Handle special module types (like types from typing module) + if hasattr(annotation, "__module__") and hasattr(annotation, "__qualname__"): + # For types like typing.Literal, typing.TypedDict, etc. + return annotation.__qualname__ + + # Last resort: string conversion with cleanup + type_str = str(annotation) + + # Clean up common patterns more robustly + if type_str.startswith(""): + type_str = type_str[8:-2] # Remove "" + + # Remove module prefixes for common modules + for prefix in ["typing.", "builtins.", "types."]: + if type_str.startswith(prefix): + type_str = type_str[len(prefix) :] + + # Handle special cases + if type_str in ("_empty", "inspect._empty"): + return "None" + + # Fix NoneType (this should rarely be needed now) + if type_str == "NoneType": + return "None" + + return type_str + + @classmethod + def _extract_coroutine_return_type(cls, annotation): + """Extract the actual return type from a Coroutine annotation.""" + if hasattr(annotation, "__args__") and len(annotation.__args__) > 2: + # Coroutine[Any, Any, ReturnType] -> extract ReturnType + return annotation.__args__[2] + return annotation + + @classmethod + def _format_parameter_default(cls, default_value) -> str: + """Format a parameter's default value for stub files.""" + if default_value is inspect.Parameter.empty: + return "" + elif default_value is None: + return " = None" + elif isinstance(default_value, bool): + return f" = {default_value}" + elif default_value == {}: + return " = {}" + elif default_value == []: + return " = []" + else: + return f" = {default_value}" + + @classmethod + def _format_method_parameters( + cls, + sig: inspect.Signature, + skip_self: bool = True, + type_hints: Optional[dict] = None, + type_tracker: Optional[TypeTracker] = None, + ) -> str: + """Format method parameters for stub files.""" + params = [] + if type_hints is None: + type_hints = {} + + for i, (param_name, param) in enumerate(sig.parameters.items()): + if i == 0 and param_name == "self" and skip_self: + params.append("self") + else: + # Get type annotation from type hints if available, otherwise from signature + annotation = type_hints.get(param_name, param.annotation) + type_str = cls._format_type_annotation(annotation, type_tracker) + + # Get default value + default_str = cls._format_parameter_default(param.default) + + # Combine parameter parts + if annotation is inspect.Parameter.empty: + params.append(f"{param_name}: Any{default_str}") + else: + params.append(f"{param_name}: {type_str}{default_str}") + + return ", ".join(params) + + @classmethod + def _generate_method_signature( + cls, + method_name: str, + method, + is_async: bool = False, + type_tracker: Optional[TypeTracker] = None, + ) -> str: + """Generate a complete method signature for stub files.""" + sig = inspect.signature(method) + + # Try to get evaluated type hints to resolve string annotations + try: + from typing import get_type_hints + type_hints = get_type_hints(method) + except Exception: + # Fallback to empty dict if we can't get type hints + type_hints = {} + + # For async methods, extract the actual return type + return_annotation = type_hints.get('return', sig.return_annotation) + if is_async and inspect.iscoroutinefunction(method): + return_annotation = cls._extract_coroutine_return_type(return_annotation) + + # Format parameters with type hints + params_str = cls._format_method_parameters(sig, type_hints=type_hints, type_tracker=type_tracker) + + # Format return type + return_type = cls._format_type_annotation(return_annotation, type_tracker) + if return_annotation is inspect.Signature.empty: + return_type = "None" + + return f"def {method_name}({params_str}) -> {return_type}: ..." + + @classmethod + def _generate_imports( + cls, async_class: Type, type_tracker: TypeTracker + ) -> list[str]: + """Generate import statements for the stub file.""" + imports = [] + + # Add standard typing imports + imports.append( + "from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple" + ) + + # Add imports from the original module + if async_class.__module__ != "builtins": + module = inspect.getmodule(async_class) + additional_types = [] + + if module: + # Check if module has __all__ defined + module_all = getattr(module, "__all__", None) + + for name, obj in sorted(inspect.getmembers(module)): + if isinstance(obj, type): + # Skip if __all__ is defined and this name isn't in it + # unless it's already been tracked as used in type annotations + if module_all is not None and name not in module_all: + # Check if this type was actually used in annotations + if name not in type_tracker.discovered_types: + continue + + # Check for NamedTuple + if issubclass(obj, tuple) and hasattr(obj, "_fields"): + additional_types.append(name) + # Mark as already imported + type_tracker.already_imported.add(name) + # Check for Enum + elif issubclass(obj, Enum) and name != "Enum": + additional_types.append(name) + # Mark as already imported + type_tracker.already_imported.add(name) + + if additional_types: + type_imports = ", ".join([async_class.__name__] + additional_types) + imports.append(f"from {async_class.__module__} import {type_imports}") + else: + imports.append( + f"from {async_class.__module__} import {async_class.__name__}" + ) + + # Add imports for all discovered types + # Pass the main module name to avoid duplicate imports + imports.extend( + type_tracker.get_imports(main_module_name=async_class.__module__) + ) + + # Add base module import if needed + if hasattr(inspect.getmodule(async_class), "__name__"): + module_name = inspect.getmodule(async_class).__name__ + if "." in module_name: + base_module = module_name.split(".")[0] + # Only add if not already importing from it + if not any(imp.startswith(f"from {base_module}") for imp in imports): + imports.append(f"import {base_module}") + + return imports + + @classmethod + def _get_class_attributes(cls, async_class: Type) -> list[tuple[str, Type]]: + """Extract class attributes that are classes themselves.""" + class_attributes = [] + + # Look for class attributes that are classes + for name, attr in sorted(inspect.getmembers(async_class)): + if isinstance(attr, type) and not name.startswith("_"): + class_attributes.append((name, attr)) + elif ( + hasattr(async_class, "__annotations__") + and name in async_class.__annotations__ + ): + annotation = async_class.__annotations__[name] + if isinstance(annotation, type): + class_attributes.append((name, annotation)) + + return class_attributes + + @classmethod + def _generate_inner_class_stub( + cls, + name: str, + attr: Type, + indent: str = " ", + type_tracker: Optional[TypeTracker] = None, + ) -> list[str]: + """Generate stub for an inner class.""" + stub_lines = [] + stub_lines.append(f"{indent}class {name}Sync:") + + # Add docstring if available + if hasattr(attr, "__doc__") and attr.__doc__: + stub_lines.extend( + cls._format_docstring_for_stub(attr.__doc__, f"{indent} ") + ) + + # Add __init__ if it exists + if hasattr(attr, "__init__"): + try: + init_method = getattr(attr, "__init__") + init_sig = inspect.signature(init_method) + + # Try to get type hints + try: + from typing import get_type_hints + init_hints = get_type_hints(init_method) + except Exception: + init_hints = {} + + # Format parameters + params_str = cls._format_method_parameters( + init_sig, type_hints=init_hints, type_tracker=type_tracker + ) + # Add __init__ docstring if available (before the method) + if hasattr(init_method, "__doc__") and init_method.__doc__: + stub_lines.extend( + cls._format_docstring_for_stub( + init_method.__doc__, f"{indent} " + ) + ) + stub_lines.append( + f"{indent} def __init__({params_str}) -> None: ..." + ) + except (ValueError, TypeError): + stub_lines.append( + f"{indent} def __init__(self, *args, **kwargs) -> None: ..." + ) + + # Add methods to the inner class + has_methods = False + for method_name, method in sorted( + inspect.getmembers(attr, predicate=inspect.isfunction) + ): + if method_name.startswith("_"): + continue + + has_methods = True + try: + # Add method docstring if available (before the method signature) + if method.__doc__: + stub_lines.extend( + cls._format_docstring_for_stub(method.__doc__, f"{indent} ") + ) + + method_sig = cls._generate_method_signature( + method_name, method, is_async=True, type_tracker=type_tracker + ) + stub_lines.append(f"{indent} {method_sig}") + except (ValueError, TypeError): + stub_lines.append( + f"{indent} def {method_name}(self, *args, **kwargs): ..." + ) + + if not has_methods: + stub_lines.append(f"{indent} pass") + + return stub_lines + + @classmethod + def _format_docstring_for_stub( + cls, docstring: str, indent: str = " " + ) -> list[str]: + """Format a docstring for inclusion in a stub file with proper indentation.""" + if not docstring: + return [] + + # First, dedent the docstring to remove any existing indentation + dedented = textwrap.dedent(docstring).strip() + + # Split into lines + lines = dedented.split("\n") + + # Build the properly indented docstring + result = [] + result.append(f'{indent}"""') + + for line in lines: + if line.strip(): # Non-empty line + result.append(f"{indent}{line}") + else: # Empty line + result.append("") + + result.append(f'{indent}"""') + return result + + @classmethod + def _post_process_stub_content(cls, stub_content: list[str]) -> list[str]: + """Post-process stub content to fix any remaining issues.""" + processed = [] + + for line in stub_content: + # Skip processing imports + if line.startswith(("from ", "import ")): + processed.append(line) + continue + + # Fix method signatures missing return types + if ( + line.strip().startswith("def ") + and line.strip().endswith(": ...") + and ") -> " not in line + ): + # Add -> None for methods without return annotation + line = line.replace(": ...", " -> None: ...") + + processed.append(line) + + return processed + + @classmethod + def generate_stub_file(cls, async_class: Type, sync_class: Type) -> None: + """ + Generate a .pyi stub file for the sync class to help IDEs with type checking. + """ + try: + # Only generate stub if we can determine module path + if async_class.__module__ == "__main__": + return + + module = inspect.getmodule(async_class) + if not module: + return + + module_path = module.__file__ + if not module_path: + return + + # Create stub file path in a 'generated' subdirectory + module_dir = os.path.dirname(module_path) + stub_dir = os.path.join(module_dir, "generated") + + # Ensure the generated directory exists + os.makedirs(stub_dir, exist_ok=True) + + module_name = os.path.basename(module_path) + if module_name.endswith(".py"): + module_name = module_name[:-3] + + sync_stub_path = os.path.join(stub_dir, f"{sync_class.__name__}.pyi") + + # Create a type tracker for this stub generation + type_tracker = TypeTracker() + + stub_content = [] + + # We'll generate imports after processing all methods to capture all types + # Leave a placeholder for imports + imports_placeholder_index = len(stub_content) + stub_content.append("") # Will be replaced with imports later + + # Class definition + stub_content.append(f"class {sync_class.__name__}:") + + # Docstring + if async_class.__doc__: + stub_content.extend( + cls._format_docstring_for_stub(async_class.__doc__, " ") + ) + + # Generate __init__ + try: + init_method = async_class.__init__ + init_signature = inspect.signature(init_method) + + # Try to get type hints for __init__ + try: + from typing import get_type_hints + init_hints = get_type_hints(init_method) + except Exception: + init_hints = {} + + # Format parameters + params_str = cls._format_method_parameters( + init_signature, type_hints=init_hints, type_tracker=type_tracker + ) + # Add __init__ docstring if available (before the method) + if hasattr(init_method, "__doc__") and init_method.__doc__: + stub_content.extend( + cls._format_docstring_for_stub(init_method.__doc__, " ") + ) + stub_content.append(f" def __init__({params_str}) -> None: ...") + except (ValueError, TypeError): + stub_content.append( + " def __init__(self, *args, **kwargs) -> None: ..." + ) + + stub_content.append("") # Add newline after __init__ + + # Get class attributes + class_attributes = cls._get_class_attributes(async_class) + + # Generate inner classes + for name, attr in class_attributes: + inner_class_stub = cls._generate_inner_class_stub( + name, attr, type_tracker=type_tracker + ) + stub_content.extend(inner_class_stub) + stub_content.append("") # Add newline after the inner class + + # Add methods to the main class + processed_methods = set() # Keep track of methods we've processed + for name, method in sorted( + inspect.getmembers(async_class, predicate=inspect.isfunction) + ): + if name.startswith("_") or name in processed_methods: + continue + + processed_methods.add(name) + + try: + method_sig = cls._generate_method_signature( + name, method, is_async=True, type_tracker=type_tracker + ) + + # Add docstring if available (before the method signature for proper formatting) + if method.__doc__: + stub_content.extend( + cls._format_docstring_for_stub(method.__doc__, " ") + ) + + stub_content.append(f" {method_sig}") + + stub_content.append("") # Add newline after each method + + except (ValueError, TypeError): + # If we can't get the signature, just add a simple stub + stub_content.append(f" def {name}(self, *args, **kwargs): ...") + stub_content.append("") # Add newline + + # Add properties + for name, prop in sorted( + inspect.getmembers(async_class, lambda x: isinstance(x, property)) + ): + stub_content.append(" @property") + stub_content.append(f" def {name}(self) -> Any: ...") + if prop.fset: + stub_content.append(f" @{name}.setter") + stub_content.append( + f" def {name}(self, value: Any) -> None: ..." + ) + stub_content.append("") # Add newline after each property + + # Add placeholders for the nested class instances + # Check the actual attribute names from class annotations and attributes + attribute_mappings = {} + + # First check annotations for typed attributes (including from parent classes) + # Collect all annotations from the class hierarchy + all_annotations = {} + for base_class in reversed(inspect.getmro(async_class)): + if hasattr(base_class, "__annotations__"): + all_annotations.update(base_class.__annotations__) + + for attr_name, attr_type in sorted(all_annotations.items()): + for class_name, class_type in class_attributes: + # If the class type matches the annotated type + if ( + attr_type == class_type + or (hasattr(attr_type, "__name__") and attr_type.__name__ == class_name) + or (isinstance(attr_type, str) and attr_type == class_name) + ): + attribute_mappings[class_name] = attr_name + + # Remove the extra checking - annotations should be sufficient + + # Add the attribute declarations with proper names + for class_name, class_type in class_attributes: + # Check if there's a mapping from annotation + attr_name = attribute_mappings.get(class_name, class_name) + # Use the annotation name if it exists, even if the attribute doesn't exist yet + # This is because the attribute might be created at runtime + stub_content.append(f" {attr_name}: {class_name}Sync") + + stub_content.append("") # Add a final newline + + # Now generate imports with all discovered types + imports = cls._generate_imports(async_class, type_tracker) + + # Deduplicate imports while preserving order + seen = set() + unique_imports = [] + for imp in imports: + if imp not in seen: + seen.add(imp) + unique_imports.append(imp) + else: + logging.warning(f"Duplicate import detected: {imp}") + + # Replace the placeholder with actual imports + stub_content[imports_placeholder_index : imports_placeholder_index + 1] = ( + unique_imports + ) + + # Post-process stub content + stub_content = cls._post_process_stub_content(stub_content) + + # Write stub file + with open(sync_stub_path, "w") as f: + f.write("\n".join(stub_content)) + + logging.info(f"Generated stub file: {sync_stub_path}") + + except Exception as e: + # If stub generation fails, log the error but don't break the main functionality + logging.error( + f"Error generating stub file for {sync_class.__name__}: {str(e)}" + ) + import traceback + + logging.error(traceback.format_exc()) + + +def create_sync_class(async_class: Type, thread_pool_size=10) -> Type: + """ + Creates a sync version of an async class + + Args: + async_class: The async class to convert + thread_pool_size: Size of thread pool to use + + Returns: + A new class with sync versions of all async methods + """ + return AsyncToSyncConverter.create_sync_class(async_class, thread_pool_size) diff --git a/comfy_api/internal/singleton.py b/comfy_api/internal/singleton.py new file mode 100644 index 000000000..75f16f98e --- /dev/null +++ b/comfy_api/internal/singleton.py @@ -0,0 +1,33 @@ +from typing import Type, TypeVar + +class SingletonMetaclass(type): + T = TypeVar("T", bound="SingletonMetaclass") + _instances = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(SingletonMetaclass, cls).__call__( + *args, **kwargs + ) + return cls._instances[cls] + + def inject_instance(cls: Type[T], instance: T) -> None: + assert cls not in SingletonMetaclass._instances, ( + "Cannot inject instance after first instantiation" + ) + SingletonMetaclass._instances[cls] = instance + + def get_instance(cls: Type[T], *args, **kwargs) -> T: + """ + Gets the singleton instance of the class, creating it if it doesn't exist. + """ + if cls not in SingletonMetaclass._instances: + SingletonMetaclass._instances[cls] = super( + SingletonMetaclass, cls + ).__call__(*args, **kwargs) + return cls._instances[cls] + + +class ProxiedSingleton(object, metaclass=SingletonMetaclass): + def __init__(self): + super().__init__() diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py new file mode 100644 index 000000000..e1f3a3655 --- /dev/null +++ b/comfy_api/latest/__init__.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +from typing import Type, TYPE_CHECKING +from comfy_api.internal import ComfyAPIBase +from comfy_api.internal.singleton import ProxiedSingleton +from comfy_api.internal.async_to_sync import create_sync_class +from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput +from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents +from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents +from comfy_execution.utils import get_executing_context +from comfy_execution.progress import get_progress_state, PreviewImageTuple +from PIL import Image +from comfy.cli_args import args +import numpy as np + + +class ComfyAPI_latest(ComfyAPIBase): + VERSION = "latest" + STABLE = False + + class Execution(ProxiedSingleton): + async def set_progress( + self, + value: float, + max_value: float, + node_id: str | None = None, + preview_image: Image.Image | ImageInput | None = None, + ignore_size_limit: bool = False, + ) -> None: + """ + Update the progress bar displayed in the ComfyUI interface. + + This function allows custom nodes and API calls to report their progress + back to the user interface, providing visual feedback during long operations. + + Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK + """ + executing_context = get_executing_context() + if node_id is None and executing_context is not None: + node_id = executing_context.node_id + if node_id is None: + raise ValueError("node_id must be provided if not in executing context") + + # Convert preview_image to PreviewImageTuple if needed + to_display: PreviewImageTuple | Image.Image | ImageInput | None = preview_image + if to_display is not None: + # First convert to PIL Image if needed + if isinstance(to_display, ImageInput): + # Convert ImageInput (torch.Tensor) to PIL Image + # Handle tensor shape [B, H, W, C] -> get first image if batch + tensor = to_display + if len(tensor.shape) == 4: + tensor = tensor[0] + + # Convert to numpy array and scale to 0-255 + image_np = (tensor.cpu().numpy() * 255).astype(np.uint8) + to_display = Image.fromarray(image_np) + + if isinstance(to_display, Image.Image): + # Detect image format from PIL Image + image_format = to_display.format if to_display.format else "JPEG" + # Use None for preview_size if ignore_size_limit is True + preview_size = None if ignore_size_limit else args.preview_size + to_display = (image_format, to_display, preview_size) + + get_progress_state().update_progress( + node_id=node_id, + value=value, + max_value=max_value, + image=to_display, + ) + + execution: Execution + +class Input: + Image = ImageInput + Audio = AudioInput + Mask = MaskInput + Latent = LatentInput + Video = VideoInput + +class InputImpl: + VideoFromFile = VideoFromFile + VideoFromComponents = VideoFromComponents + +class Types: + VideoCodec = VideoCodec + VideoContainer = VideoContainer + VideoComponents = VideoComponents + +ComfyAPI = ComfyAPI_latest + +# Create a synchronous version of the API +if TYPE_CHECKING: + import comfy_api.latest.generated.ComfyAPISyncStub # type: ignore + + ComfyAPISync: Type[comfy_api.latest.generated.ComfyAPISyncStub.ComfyAPISyncStub] +ComfyAPISync = create_sync_class(ComfyAPI_latest) + +__all__ = [ + "ComfyAPI", + "ComfyAPISync", + "Input", + "InputImpl", + "Types", +] diff --git a/comfy_api/latest/_input/__init__.py b/comfy_api/latest/_input/__init__.py new file mode 100644 index 000000000..14f0e72f4 --- /dev/null +++ b/comfy_api/latest/_input/__init__.py @@ -0,0 +1,10 @@ +from .basic_types import ImageInput, AudioInput, MaskInput, LatentInput +from .video_types import VideoInput + +__all__ = [ + "ImageInput", + "AudioInput", + "VideoInput", + "MaskInput", + "LatentInput", +] diff --git a/comfy_api/latest/_input/basic_types.py b/comfy_api/latest/_input/basic_types.py new file mode 100644 index 000000000..245c6cbb1 --- /dev/null +++ b/comfy_api/latest/_input/basic_types.py @@ -0,0 +1,42 @@ +import torch +from typing import TypedDict, List, Optional + +ImageInput = torch.Tensor +""" +An image in format [B, H, W, C] where B is the batch size, C is the number of channels, +""" + +MaskInput = torch.Tensor +""" +A mask in format [B, H, W] where B is the batch size +""" + +class AudioInput(TypedDict): + """ + TypedDict representing audio input. + """ + + waveform: torch.Tensor + """ + Tensor in the format [B, C, T] where B is the batch size, C is the number of channels, + """ + + sample_rate: int + +class LatentInput(TypedDict): + """ + TypedDict representing latent input. + """ + + samples: torch.Tensor + """ + Tensor in the format [B, C, H, W] where B is the batch size, C is the number of channels, + H is the height, and W is the width. + """ + + noise_mask: Optional[MaskInput] + """ + Optional noise mask tensor in the same format as samples. + """ + + batch_index: Optional[List[int]] diff --git a/comfy_api/latest/_input/video_types.py b/comfy_api/latest/_input/video_types.py new file mode 100644 index 000000000..5d95dc507 --- /dev/null +++ b/comfy_api/latest/_input/video_types.py @@ -0,0 +1,85 @@ +from __future__ import annotations +from abc import ABC, abstractmethod +from typing import Optional, Union +import io +import av +from comfy_api.util import VideoContainer, VideoCodec, VideoComponents + +class VideoInput(ABC): + """ + Abstract base class for video input types. + """ + + @abstractmethod + def get_components(self) -> VideoComponents: + """ + Abstract method to get the video components (images, audio, and frame rate). + + Returns: + VideoComponents containing images, audio, and frame rate + """ + pass + + @abstractmethod + def save_to( + self, + path: str, + format: VideoContainer = VideoContainer.AUTO, + codec: VideoCodec = VideoCodec.AUTO, + metadata: Optional[dict] = None + ): + """ + Abstract method to save the video input to a file. + """ + pass + + def get_stream_source(self) -> Union[str, io.BytesIO]: + """ + Get a streamable source for the video. This allows processing without + loading the entire video into memory. + + Returns: + Either a file path (str) or a BytesIO object that can be opened with av. + + Default implementation creates a BytesIO buffer, but subclasses should + override this for better performance when possible. + """ + buffer = io.BytesIO() + self.save_to(buffer) + buffer.seek(0) + return buffer + + # Provide a default implementation, but subclasses can provide optimized versions + # if possible. + def get_dimensions(self) -> tuple[int, int]: + """ + Returns the dimensions of the video input. + + Returns: + Tuple of (width, height) + """ + components = self.get_components() + return components.images.shape[2], components.images.shape[1] + + def get_duration(self) -> float: + """ + Returns the duration of the video in seconds. + + Returns: + Duration in seconds + """ + components = self.get_components() + frame_count = components.images.shape[0] + return float(frame_count / components.frame_rate) + + def get_container_format(self) -> str: + """ + Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). + + Returns: + Container format as string + """ + # Default implementation - subclasses should override for better performance + source = self.get_stream_source() + with av.open(source, mode="r") as container: + return container.format.name diff --git a/comfy_api/latest/_input_impl/__init__.py b/comfy_api/latest/_input_impl/__init__.py new file mode 100644 index 000000000..02901b8b9 --- /dev/null +++ b/comfy_api/latest/_input_impl/__init__.py @@ -0,0 +1,7 @@ +from .video_types import VideoFromFile, VideoFromComponents + +__all__ = [ + # Implementations + "VideoFromFile", + "VideoFromComponents", +] diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py new file mode 100644 index 000000000..28de9651d --- /dev/null +++ b/comfy_api/latest/_input_impl/video_types.py @@ -0,0 +1,324 @@ +from __future__ import annotations +from av.container import InputContainer +from av.subtitles.stream import SubtitleStream +from fractions import Fraction +from typing import Optional +from comfy_api.latest._input import AudioInput, VideoInput +import av +import io +import json +import numpy as np +import torch +from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents + + +def container_to_output_format(container_format: str | None) -> str | None: + """ + A container's `format` may be a comma-separated list of formats. + E.g., iso container's `format` may be `mov,mp4,m4a,3gp,3g2,mj2`. + However, writing to a file/stream with `av.open` requires a single format, + or `None` to auto-detect. + """ + if not container_format: + return None # Auto-detect + + if "," not in container_format: + return container_format + + formats = container_format.split(",") + return formats[0] + + +def get_open_write_kwargs( + dest: str | io.BytesIO, container_format: str, to_format: str | None +) -> dict: + """Get kwargs for writing a `VideoFromFile` to a file/stream with `av.open`""" + open_kwargs = { + "mode": "w", + # If isobmff, preserve custom metadata tags (workflow, prompt, extra_pnginfo) + "options": {"movflags": "use_metadata_tags"}, + } + + is_write_to_buffer = isinstance(dest, io.BytesIO) + if is_write_to_buffer: + # Set output format explicitly, since it cannot be inferred from file extension + if to_format == VideoContainer.AUTO: + to_format = container_format.lower() + elif isinstance(to_format, str): + to_format = to_format.lower() + open_kwargs["format"] = container_to_output_format(to_format) + + return open_kwargs + + +class VideoFromFile(VideoInput): + """ + Class representing video input from a file. + """ + + def __init__(self, file: str | io.BytesIO): + """ + Initialize the VideoFromFile object based off of either a path on disk or a BytesIO object + containing the file contents. + """ + self.__file = file + + def get_stream_source(self) -> str | io.BytesIO: + """ + Return the underlying file source for efficient streaming. + This avoids unnecessary memory copies when the source is already a file path. + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) + return self.__file + + def get_dimensions(self) -> tuple[int, int]: + """ + Returns the dimensions of the video input. + + Returns: + Tuple of (width, height) + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) # Reset the BytesIO object to the beginning + with av.open(self.__file, mode='r') as container: + for stream in container.streams: + if stream.type == 'video': + assert isinstance(stream, av.VideoStream) + return stream.width, stream.height + raise ValueError(f"No video stream found in file '{self.__file}'") + + def get_duration(self) -> float: + """ + Returns the duration of the video in seconds. + + Returns: + Duration in seconds + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) + with av.open(self.__file, mode="r") as container: + if container.duration is not None: + return float(container.duration / av.time_base) + + # Fallback: calculate from frame count and frame rate + video_stream = next( + (s for s in container.streams if s.type == "video"), None + ) + if video_stream and video_stream.frames and video_stream.average_rate: + return float(video_stream.frames / video_stream.average_rate) + + # Last resort: decode frames to count them + if video_stream and video_stream.average_rate: + frame_count = 0 + container.seek(0) + for packet in container.demux(video_stream): + for _ in packet.decode(): + frame_count += 1 + if frame_count > 0: + return float(frame_count / video_stream.average_rate) + + raise ValueError(f"Could not determine duration for file '{self.__file}'") + + def get_container_format(self) -> str: + """ + Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). + + Returns: + Container format as string + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) + with av.open(self.__file, mode='r') as container: + return container.format.name + + def get_components_internal(self, container: InputContainer) -> VideoComponents: + # Get video frames + frames = [] + for frame in container.decode(video=0): + img = frame.to_ndarray(format='rgb24') # shape: (H, W, 3) + img = torch.from_numpy(img) / 255.0 # shape: (H, W, 3) + frames.append(img) + + images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0) + + # Get frame rate + video_stream = next(s for s in container.streams if s.type == 'video') + frame_rate = Fraction(video_stream.average_rate) if video_stream and video_stream.average_rate else Fraction(1) + + # Get audio if available + audio = None + try: + container.seek(0) # Reset the container to the beginning + for stream in container.streams: + if stream.type != 'audio': + continue + assert isinstance(stream, av.AudioStream) + audio_frames = [] + for packet in container.demux(stream): + for frame in packet.decode(): + assert isinstance(frame, av.AudioFrame) + audio_frames.append(frame.to_ndarray()) # shape: (channels, samples) + if len(audio_frames) > 0: + audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples) + audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples) + audio = AudioInput({ + "waveform": audio_tensor, + "sample_rate": int(stream.sample_rate) if stream.sample_rate else 1, + }) + except StopIteration: + pass # No audio stream + + metadata = container.metadata + return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata) + + def get_components(self) -> VideoComponents: + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) # Reset the BytesIO object to the beginning + with av.open(self.__file, mode='r') as container: + return self.get_components_internal(container) + raise ValueError(f"No video stream found in file '{self.__file}'") + + def save_to( + self, + path: str | io.BytesIO, + format: VideoContainer = VideoContainer.AUTO, + codec: VideoCodec = VideoCodec.AUTO, + metadata: Optional[dict] = None + ): + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) # Reset the BytesIO object to the beginning + with av.open(self.__file, mode='r') as container: + container_format = container.format.name + video_encoding = container.streams.video[0].codec.name if len(container.streams.video) > 0 else None + reuse_streams = True + if format != VideoContainer.AUTO and format not in container_format.split(","): + reuse_streams = False + if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None: + reuse_streams = False + + if not reuse_streams: + components = self.get_components_internal(container) + video = VideoFromComponents(components) + return video.save_to( + path, + format=format, + codec=codec, + metadata=metadata + ) + + streams = container.streams + + open_kwargs = get_open_write_kwargs(path, container_format, format) + with av.open(path, **open_kwargs) as output_container: + # Copy over the original metadata + for key, value in container.metadata.items(): + if metadata is None or key not in metadata: + output_container.metadata[key] = value + + # Add our new metadata + if metadata is not None: + for key, value in metadata.items(): + if isinstance(value, str): + output_container.metadata[key] = value + else: + output_container.metadata[key] = json.dumps(value) + + # Add streams to the new container + stream_map = {} + for stream in streams: + if isinstance(stream, (av.VideoStream, av.AudioStream, SubtitleStream)): + out_stream = output_container.add_stream_from_template(template=stream, opaque=True) + stream_map[stream] = out_stream + + # Write packets to the new container + for packet in container.demux(): + if packet.stream in stream_map and packet.dts is not None: + packet.stream = stream_map[packet.stream] + output_container.mux(packet) + +class VideoFromComponents(VideoInput): + """ + Class representing video input from tensors. + """ + + def __init__(self, components: VideoComponents): + self.__components = components + + def get_components(self) -> VideoComponents: + return VideoComponents( + images=self.__components.images, + audio=self.__components.audio, + frame_rate=self.__components.frame_rate + ) + + def save_to( + self, + path: str, + format: VideoContainer = VideoContainer.AUTO, + codec: VideoCodec = VideoCodec.AUTO, + metadata: Optional[dict] = None + ): + if format != VideoContainer.AUTO and format != VideoContainer.MP4: + raise ValueError("Only MP4 format is supported for now") + if codec != VideoCodec.AUTO and codec != VideoCodec.H264: + raise ValueError("Only H264 codec is supported for now") + with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}) as output: + # Add metadata before writing any streams + if metadata is not None: + for key, value in metadata.items(): + output.metadata[key] = json.dumps(value) + + frame_rate = Fraction(round(self.__components.frame_rate * 1000), 1000) + # Create a video stream + video_stream = output.add_stream('h264', rate=frame_rate) + video_stream.width = self.__components.images.shape[2] + video_stream.height = self.__components.images.shape[1] + video_stream.pix_fmt = 'yuv420p' + + # Create an audio stream + audio_sample_rate = 1 + audio_stream: Optional[av.AudioStream] = None + if self.__components.audio: + audio_sample_rate = int(self.__components.audio['sample_rate']) + audio_stream = output.add_stream('aac', rate=audio_sample_rate) + audio_stream.sample_rate = audio_sample_rate + audio_stream.format = 'fltp' + + # Encode video + for i, frame in enumerate(self.__components.images): + img = (frame * 255).clamp(0, 255).byte().cpu().numpy() # shape: (H, W, 3) + frame = av.VideoFrame.from_ndarray(img, format='rgb24') + frame = frame.reformat(format='yuv420p') # Convert to YUV420P as required by h264 + packet = video_stream.encode(frame) + output.mux(packet) + + # Flush video + packet = video_stream.encode(None) + output.mux(packet) + + if audio_stream and self.__components.audio: + # Encode audio + samples_per_frame = int(audio_sample_rate / frame_rate) + num_frames = self.__components.audio['waveform'].shape[2] // samples_per_frame + for i in range(num_frames): + start = i * samples_per_frame + end = start + samples_per_frame + # TODO(Feature) - Add support for stereo audio + chunk = ( + self.__components.audio["waveform"][0, 0, start:end] + .unsqueeze(0) + .contiguous() + .numpy() + ) + audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono') + audio_frame.sample_rate = audio_sample_rate + audio_frame.pts = i * samples_per_frame + for packet in audio_stream.encode(audio_frame): + output.mux(packet) + + # Flush audio + for packet in audio_stream.encode(None): + output.mux(packet) + + diff --git a/comfy_api/latest/_util/__init__.py b/comfy_api/latest/_util/__init__.py new file mode 100644 index 000000000..9019c46db --- /dev/null +++ b/comfy_api/latest/_util/__init__.py @@ -0,0 +1,8 @@ +from .video_types import VideoContainer, VideoCodec, VideoComponents + +__all__ = [ + # Utility Types + "VideoContainer", + "VideoCodec", + "VideoComponents", +] diff --git a/comfy_api/latest/_util/video_types.py b/comfy_api/latest/_util/video_types.py new file mode 100644 index 000000000..c3e3d8e3a --- /dev/null +++ b/comfy_api/latest/_util/video_types.py @@ -0,0 +1,52 @@ +from __future__ import annotations +from dataclasses import dataclass +from enum import Enum +from fractions import Fraction +from typing import Optional +from comfy_api.latest._input import ImageInput, AudioInput + +class VideoCodec(str, Enum): + AUTO = "auto" + H264 = "h264" + + @classmethod + def as_input(cls) -> list[str]: + """ + Returns a list of codec names that can be used as node input. + """ + return [member.value for member in cls] + +class VideoContainer(str, Enum): + AUTO = "auto" + MP4 = "mp4" + + @classmethod + def as_input(cls) -> list[str]: + """ + Returns a list of container names that can be used as node input. + """ + return [member.value for member in cls] + + @classmethod + def get_extension(cls, value) -> str: + """ + Returns the file extension for the container. + """ + if isinstance(value, str): + value = cls(value) + if value == VideoContainer.MP4 or value == VideoContainer.AUTO: + return "mp4" + return "" + +@dataclass +class VideoComponents: + """ + Dataclass representing the components of a video. + """ + + images: ImageInput + frame_rate: Fraction + audio: Optional[AudioInput] = None + metadata: Optional[dict] = None + + diff --git a/comfy_api/latest/generated/ComfyAPISyncStub.pyi b/comfy_api/latest/generated/ComfyAPISyncStub.pyi new file mode 100644 index 000000000..525c074dd --- /dev/null +++ b/comfy_api/latest/generated/ComfyAPISyncStub.pyi @@ -0,0 +1,20 @@ +from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple +from comfy_api.latest import ComfyAPI_latest +from PIL.Image import Image +from torch import Tensor +class ComfyAPISyncStub: + def __init__(self) -> None: ... + + class ExecutionSync: + def __init__(self) -> None: ... + """ + Update the progress bar displayed in the ComfyUI interface. + + This function allows custom nodes and API calls to report their progress + back to the user interface, providing visual feedback during long operations. + + Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK + """ + def set_progress(self, value: float, max_value: float, node_id: Union[str, None] = None, preview_image: Union[Image, Tensor, None] = None, ignore_size_limit: bool = False) -> None: ... + + execution: ExecutionSync diff --git a/comfy_api/util.py b/comfy_api/util.py new file mode 100644 index 000000000..1aa9606d2 --- /dev/null +++ b/comfy_api/util.py @@ -0,0 +1,8 @@ +# This file only exists for backwards compatibility. +from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents + +__all__ = [ + "VideoCodec", + "VideoContainer", + "VideoComponents", +] diff --git a/comfy_api/util/__init__.py b/comfy_api/util/__init__.py index 9019c46db..4c8a89d1e 100644 --- a/comfy_api/util/__init__.py +++ b/comfy_api/util/__init__.py @@ -1,7 +1,7 @@ -from .video_types import VideoContainer, VideoCodec, VideoComponents +# This file only exists for backwards compatibility. +from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents __all__ = [ - # Utility Types "VideoContainer", "VideoCodec", "VideoComponents", diff --git a/comfy_api/util/video_types.py b/comfy_api/util/video_types.py index d09663db9..68c780d64 100644 --- a/comfy_api/util/video_types.py +++ b/comfy_api/util/video_types.py @@ -1,51 +1,12 @@ -from __future__ import annotations -from dataclasses import dataclass -from enum import Enum -from fractions import Fraction -from typing import Optional -from comfy_api.input import ImageInput, AudioInput - -class VideoCodec(str, Enum): - AUTO = "auto" - H264 = "h264" - - @classmethod - def as_input(cls) -> list[str]: - """ - Returns a list of codec names that can be used as node input. - """ - return [member.value for member in cls] - -class VideoContainer(str, Enum): - AUTO = "auto" - MP4 = "mp4" - - @classmethod - def as_input(cls) -> list[str]: - """ - Returns a list of container names that can be used as node input. - """ - return [member.value for member in cls] - - @classmethod - def get_extension(cls, value) -> str: - """ - Returns the file extension for the container. - """ - if isinstance(value, str): - value = cls(value) - if value == VideoContainer.MP4 or value == VideoContainer.AUTO: - return "mp4" - return "" - -@dataclass -class VideoComponents: - """ - Dataclass representing the components of a video. - """ - - images: ImageInput - frame_rate: Fraction - audio: Optional[AudioInput] = None - metadata: Optional[dict] = None +# This file only exists for backwards compatibility. +from comfy_api.latest._util.video_types import ( + VideoContainer, + VideoCodec, + VideoComponents, +) +__all__ = [ + "VideoContainer", + "VideoCodec", + "VideoComponents", +] diff --git a/comfy_api/v0_0_1/__init__.py b/comfy_api/v0_0_1/__init__.py new file mode 100644 index 000000000..93608771d --- /dev/null +++ b/comfy_api/v0_0_1/__init__.py @@ -0,0 +1,42 @@ +from comfy_api.v0_0_2 import ( + ComfyAPIAdapter_v0_0_2, + Input as Input_v0_0_2, + InputImpl as InputImpl_v0_0_2, + Types as Types_v0_0_2, +) +from typing import Type, TYPE_CHECKING +from comfy_api.internal.async_to_sync import create_sync_class + + +# This version only exists to serve as a template for future version adapters. +# There is no reason anyone should ever use it. +class ComfyAPIAdapter_v0_0_1(ComfyAPIAdapter_v0_0_2): + VERSION = "0.0.1" + STABLE = True + +class Input(Input_v0_0_2): + pass + +class InputImpl(InputImpl_v0_0_2): + pass + +class Types(Types_v0_0_2): + pass + +ComfyAPI = ComfyAPIAdapter_v0_0_1 + +# Create a synchronous version of the API +if TYPE_CHECKING: + from comfy_api.v0_0_1.generated.ComfyAPISyncStub import ComfyAPISyncStub # type: ignore + + ComfyAPISync: Type[ComfyAPISyncStub] + +ComfyAPISync = create_sync_class(ComfyAPIAdapter_v0_0_1) + +__all__ = [ + "ComfyAPI", + "ComfyAPISync", + "Input", + "InputImpl", + "Types", +] diff --git a/comfy_api/v0_0_1/generated/ComfyAPISyncStub.pyi b/comfy_api/v0_0_1/generated/ComfyAPISyncStub.pyi new file mode 100644 index 000000000..270030324 --- /dev/null +++ b/comfy_api/v0_0_1/generated/ComfyAPISyncStub.pyi @@ -0,0 +1,20 @@ +from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple +from comfy_api.v0_0_1 import ComfyAPIAdapter_v0_0_1 +from PIL.Image import Image +from torch import Tensor +class ComfyAPISyncStub: + def __init__(self) -> None: ... + + class ExecutionSync: + def __init__(self) -> None: ... + """ + Update the progress bar displayed in the ComfyUI interface. + + This function allows custom nodes and API calls to report their progress + back to the user interface, providing visual feedback during long operations. + + Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK + """ + def set_progress(self, value: float, max_value: float, node_id: Union[str, None] = None, preview_image: Union[Image, Tensor, None] = None, ignore_size_limit: bool = False) -> None: ... + + execution: ExecutionSync diff --git a/comfy_api/v0_0_2/__init__.py b/comfy_api/v0_0_2/__init__.py new file mode 100644 index 000000000..ea83833fb --- /dev/null +++ b/comfy_api/v0_0_2/__init__.py @@ -0,0 +1,43 @@ +from comfy_api.latest import ( + ComfyAPI_latest, + Input as Input_latest, + InputImpl as InputImpl_latest, + Types as Types_latest, +) +from typing import Type, TYPE_CHECKING +from comfy_api.internal.async_to_sync import create_sync_class + + +class ComfyAPIAdapter_v0_0_2(ComfyAPI_latest): + VERSION = "0.0.2" + STABLE = False + + +class Input(Input_latest): + pass + + +class InputImpl(InputImpl_latest): + pass + + +class Types(Types_latest): + pass + + +ComfyAPI = ComfyAPIAdapter_v0_0_2 + +# Create a synchronous version of the API +if TYPE_CHECKING: + from comfy_api.v0_0_2.generated.ComfyAPISyncStub import ComfyAPISyncStub # type: ignore + + ComfyAPISync: Type[ComfyAPISyncStub] +ComfyAPISync = create_sync_class(ComfyAPIAdapter_v0_0_2) + +__all__ = [ + "ComfyAPI", + "ComfyAPISync", + "Input", + "InputImpl", + "Types", +] diff --git a/comfy_api/v0_0_2/generated/ComfyAPISyncStub.pyi b/comfy_api/v0_0_2/generated/ComfyAPISyncStub.pyi new file mode 100644 index 000000000..7fcec685e --- /dev/null +++ b/comfy_api/v0_0_2/generated/ComfyAPISyncStub.pyi @@ -0,0 +1,20 @@ +from typing import Any, Dict, List, Optional, Tuple, Union, Set, Sequence, cast, NamedTuple +from comfy_api.v0_0_2 import ComfyAPIAdapter_v0_0_2 +from PIL.Image import Image +from torch import Tensor +class ComfyAPISyncStub: + def __init__(self) -> None: ... + + class ExecutionSync: + def __init__(self) -> None: ... + """ + Update the progress bar displayed in the ComfyUI interface. + + This function allows custom nodes and API calls to report their progress + back to the user interface, providing visual feedback during long operations. + + Migration from previous API: comfy.utils.PROGRESS_BAR_HOOK + """ + def set_progress(self, value: float, max_value: float, node_id: Union[str, None] = None, preview_image: Union[Image, Tensor, None] = None, ignore_size_limit: bool = False) -> None: ... + + execution: ExecutionSync diff --git a/comfy_api/version_list.py b/comfy_api/version_list.py new file mode 100644 index 000000000..7cb1871d5 --- /dev/null +++ b/comfy_api/version_list.py @@ -0,0 +1,12 @@ +from comfy_api.latest import ComfyAPI_latest +from comfy_api.v0_0_2 import ComfyAPIAdapter_v0_0_2 +from comfy_api.v0_0_1 import ComfyAPIAdapter_v0_0_1 +from comfy_api.internal import ComfyAPIBase +from typing import List, Type + +supported_versions: List[Type[ComfyAPIBase]] = [ + ComfyAPI_latest, + ComfyAPIAdapter_v0_0_2, + ComfyAPIAdapter_v0_0_1, +] + diff --git a/comfy_api_nodes/apis/request_logger.py b/comfy_api_nodes/apis/request_logger.py index 93517ede9..42901e141 100644 --- a/comfy_api_nodes/apis/request_logger.py +++ b/comfy_api_nodes/apis/request_logger.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os import datetime import json diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 5935ab2bb..af33279d5 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -2,6 +2,8 @@ API Nodes for Gemini Multimodal LLM Usage via Remote API See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference """ +from __future__ import annotations + import os from enum import Enum diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py index 73dba3f75..e8f5ede1e 100644 --- a/comfy_execution/progress.py +++ b/comfy_execution/progress.py @@ -1,4 +1,6 @@ -from typing import TypedDict, Dict, Optional +from __future__ import annotations + +from typing import TypedDict, Dict, Optional, Tuple from typing_extensions import override from PIL import Image from enum import Enum @@ -10,6 +12,7 @@ if TYPE_CHECKING: from protocol import BinaryEventTypes from comfy_api import feature_flags +PreviewImageTuple = Tuple[str, Image.Image, Optional[int]] class NodeState(Enum): Pending = "pending" @@ -52,7 +55,7 @@ class ProgressHandler(ABC): max_value: float, state: NodeProgressState, prompt_id: str, - image: Optional[Image.Image] = None, + image: PreviewImageTuple | None = None, ): """Called when a node's progress is updated""" pass @@ -103,7 +106,7 @@ class CLIProgressHandler(ProgressHandler): max_value: float, state: NodeProgressState, prompt_id: str, - image: Optional[Image.Image] = None, + image: PreviewImageTuple | None = None, ): # Handle case where start_handler wasn't called if node_id not in self.progress_bars: @@ -196,7 +199,7 @@ class WebUIProgressHandler(ProgressHandler): max_value: float, state: NodeProgressState, prompt_id: str, - image: Optional[Image.Image] = None, + image: PreviewImageTuple | None = None, ): # Send progress state of all nodes if self.registry: @@ -231,7 +234,6 @@ class WebUIProgressHandler(ProgressHandler): if self.registry: self._send_progress_state(prompt_id, self.registry.nodes) - class ProgressRegistry: """ Registry that maintains node progress state and notifies registered handlers. @@ -285,7 +287,7 @@ class ProgressRegistry: handler.start_handler(node_id, entry, self.prompt_id) def update_progress( - self, node_id: str, value: float, max_value: float, image: Optional[Image.Image] + self, node_id: str, value: float, max_value: float, image: PreviewImageTuple | None = None ) -> None: """Update progress for a node""" entry = self.ensure_entry(node_id) @@ -317,7 +319,7 @@ class ProgressRegistry: handler.reset() # Global registry instance -global_progress_registry: ProgressRegistry = None +global_progress_registry: ProgressRegistry | None = None def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt") -> None: global global_progress_registry diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py index 61f7171b2..969f888b9 100644 --- a/comfy_extras/nodes_video.py +++ b/comfy_extras/nodes_video.py @@ -8,9 +8,7 @@ import json from typing import Optional, Literal from fractions import Fraction from comfy.comfy_types import IO, FileLocator, ComfyNodeABC -from comfy_api.input import ImageInput, AudioInput, VideoInput -from comfy_api.util import VideoContainer, VideoCodec, VideoComponents -from comfy_api.input_impl import VideoFromFile, VideoFromComponents +from comfy_api.latest import Input, InputImpl, Types from comfy.cli_args import args class SaveWEBM: @@ -91,8 +89,8 @@ class SaveVideo(ComfyNodeABC): "required": { "video": (IO.VIDEO, {"tooltip": "The video to save."}), "filename_prefix": ("STRING", {"default": "video/ComfyUI", "tooltip": "The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."}), - "format": (VideoContainer.as_input(), {"default": "auto", "tooltip": "The format to save the video as."}), - "codec": (VideoCodec.as_input(), {"default": "auto", "tooltip": "The codec to use for the video."}), + "format": (Types.VideoContainer.as_input(), {"default": "auto", "tooltip": "The format to save the video as."}), + "codec": (Types.VideoCodec.as_input(), {"default": "auto", "tooltip": "The codec to use for the video."}), }, "hidden": { "prompt": "PROMPT", @@ -108,7 +106,7 @@ class SaveVideo(ComfyNodeABC): CATEGORY = "image/video" DESCRIPTION = "Saves the input images to your ComfyUI output directory." - def save_video(self, video: VideoInput, filename_prefix, format, codec, prompt=None, extra_pnginfo=None): + def save_video(self, video: Input.Video, filename_prefix, format, codec, prompt=None, extra_pnginfo=None): filename_prefix += self.prefix_append width, height = video.get_dimensions() full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path( @@ -127,7 +125,7 @@ class SaveVideo(ComfyNodeABC): metadata["prompt"] = prompt if len(metadata) > 0: saved_metadata = metadata - file = f"{filename}_{counter:05}_.{VideoContainer.get_extension(format)}" + file = f"{filename}_{counter:05}_.{Types.VideoContainer.get_extension(format)}" video.save_to( os.path.join(full_output_folder, file), format=format, @@ -163,9 +161,9 @@ class CreateVideo(ComfyNodeABC): CATEGORY = "image/video" DESCRIPTION = "Create a video from images." - def create_video(self, images: ImageInput, fps: float, audio: Optional[AudioInput] = None): - return (VideoFromComponents( - VideoComponents( + def create_video(self, images: Input.Image, fps: float, audio: Optional[Input.Audio] = None): + return (InputImpl.VideoFromComponents( + Types.VideoComponents( images=images, audio=audio, frame_rate=Fraction(fps), @@ -187,7 +185,7 @@ class GetVideoComponents(ComfyNodeABC): CATEGORY = "image/video" DESCRIPTION = "Extracts all components from a video: frames, audio, and framerate." - def get_components(self, video: VideoInput): + def get_components(self, video: Input.Video): components = video.get_components() return (components.images, components.audio, float(components.frame_rate)) @@ -208,7 +206,7 @@ class LoadVideo(ComfyNodeABC): FUNCTION = "load_video" def load_video(self, file): video_path = folder_paths.get_annotated_filepath(file) - return (VideoFromFile(video_path),) + return (InputImpl.VideoFromFile(video_path),) @classmethod def IS_CHANGED(cls, file): @@ -239,3 +237,4 @@ NODE_DISPLAY_NAME_MAPPINGS = { "GetVideoComponents": "Get Video Components", "LoadVideo": "Load Video", } + diff --git a/main.py b/main.py index e8ca8152a..9b2a33011 100644 --- a/main.py +++ b/main.py @@ -313,10 +313,10 @@ def start_comfyui(asyncio_loop=None): prompt_server = server.PromptServer(asyncio_loop) hook_breaker_ac10a0.save_functions() - nodes.init_extra_nodes( + asyncio_loop.run_until_complete(nodes.init_extra_nodes( init_custom_nodes=(not args.disable_all_custom_nodes) or len(args.whitelist_custom_nodes) > 0, init_api_nodes=not args.disable_api_nodes - ) + )) hook_breaker_ac10a0.restore_functions() cuda_malloc_warning() diff --git a/nodes.py b/nodes.py index 231d4d4de..54e530388 100644 --- a/nodes.py +++ b/nodes.py @@ -1,6 +1,7 @@ from __future__ import annotations import torch + import os import sys import json @@ -26,6 +27,8 @@ import comfy.sd import comfy.utils import comfy.controlnet from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict, FileLocator +from comfy_api.internal import register_versions, ComfyAPIWithVersion +from comfy_api.version_list import supported_versions import comfy.clip_vision @@ -2101,7 +2104,7 @@ def get_module_name(module_path: str) -> str: return base_path -def load_custom_node(module_path: str, ignore=set(), module_parent="custom_nodes") -> bool: +async def load_custom_node(module_path: str, ignore=set(), module_parent="custom_nodes") -> bool: module_name = get_module_name(module_path) if os.path.isfile(module_path): sp = os.path.splitext(module_path) @@ -2165,7 +2168,7 @@ def load_custom_node(module_path: str, ignore=set(), module_parent="custom_nodes logging.warning(f"Cannot import {module_path} module for custom nodes: {e}") return False -def init_external_custom_nodes(): +async def init_external_custom_nodes(): """ Initializes the external custom nodes. @@ -2191,7 +2194,7 @@ def init_external_custom_nodes(): logging.info(f"Skipping {possible_module} due to disable_all_custom_nodes and whitelist_custom_nodes") continue time_before = time.perf_counter() - success = load_custom_node(module_path, base_node_names, module_parent="custom_nodes") + success = await load_custom_node(module_path, base_node_names, module_parent="custom_nodes") node_import_times.append((time.perf_counter() - time_before, module_path, success)) if len(node_import_times) > 0: @@ -2204,7 +2207,7 @@ def init_external_custom_nodes(): logging.info("{:6.1f} seconds{}: {}".format(n[0], import_message, n[1])) logging.info("") -def init_builtin_extra_nodes(): +async def init_builtin_extra_nodes(): """ Initializes the built-in extra nodes in ComfyUI. @@ -2288,13 +2291,13 @@ def init_builtin_extra_nodes(): import_failed = [] for node_file in extras_files: - if not load_custom_node(os.path.join(extras_dir, node_file), module_parent="comfy_extras"): + if not await load_custom_node(os.path.join(extras_dir, node_file), module_parent="comfy_extras"): import_failed.append(node_file) return import_failed -def init_builtin_api_nodes(): +async def init_builtin_api_nodes(): api_nodes_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_api_nodes") api_nodes_files = [ "nodes_ideogram.py", @@ -2315,26 +2318,35 @@ def init_builtin_api_nodes(): "nodes_gemini.py", ] - if not load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"): + if not await load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"): return api_nodes_files import_failed = [] for node_file in api_nodes_files: - if not load_custom_node(os.path.join(api_nodes_dir, node_file), module_parent="comfy_api_nodes"): + if not await load_custom_node(os.path.join(api_nodes_dir, node_file), module_parent="comfy_api_nodes"): import_failed.append(node_file) return import_failed +async def init_public_apis(): + register_versions([ + ComfyAPIWithVersion( + version=getattr(v, "VERSION"), + api_class=v + ) for v in supported_versions + ]) -def init_extra_nodes(init_custom_nodes=True, init_api_nodes=True): - import_failed = init_builtin_extra_nodes() +async def init_extra_nodes(init_custom_nodes=True, init_api_nodes=True): + await init_public_apis() + + import_failed = await init_builtin_extra_nodes() import_failed_api = [] if init_api_nodes: - import_failed_api = init_builtin_api_nodes() + import_failed_api = await init_builtin_api_nodes() if init_custom_nodes: - init_external_custom_nodes() + await init_external_custom_nodes() else: logging.info("Skipping loading of custom nodes") diff --git a/pyproject.toml b/pyproject.toml index 723c93069..244fdd232 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,4 +21,4 @@ lint.select = [ # See all rules here: https://docs.astral.sh/ruff/rules/#pyflakes-f "F", ] -exclude = ["*.ipynb"] +exclude = ["*.ipynb", "**/generated/*.pyi"] diff --git a/tests/inference/testing_nodes/testing-pack/__init__.py b/tests/inference/testing_nodes/testing-pack/__init__.py index 20f9533c7..3d5ac8a94 100644 --- a/tests/inference/testing_nodes/testing-pack/__init__.py +++ b/tests/inference/testing_nodes/testing-pack/__init__.py @@ -4,6 +4,7 @@ from .util import UTILITY_NODE_CLASS_MAPPINGS, UTILITY_NODE_DISPLAY_NAME_MAPPING from .conditions import CONDITION_NODE_CLASS_MAPPINGS, CONDITION_NODE_DISPLAY_NAME_MAPPINGS from .stubs import TEST_STUB_NODE_CLASS_MAPPINGS, TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS from .async_test_nodes import ASYNC_TEST_NODE_CLASS_MAPPINGS, ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS +from .api_test_nodes import API_TEST_NODE_CLASS_MAPPINGS, API_TEST_NODE_DISPLAY_NAME_MAPPINGS # NODE_CLASS_MAPPINGS = GENERAL_NODE_CLASS_MAPPINGS.update(COMPONENT_NODE_CLASS_MAPPINGS) # NODE_DISPLAY_NAME_MAPPINGS = GENERAL_NODE_DISPLAY_NAME_MAPPINGS.update(COMPONENT_NODE_DISPLAY_NAME_MAPPINGS) @@ -15,6 +16,7 @@ NODE_CLASS_MAPPINGS.update(UTILITY_NODE_CLASS_MAPPINGS) NODE_CLASS_MAPPINGS.update(CONDITION_NODE_CLASS_MAPPINGS) NODE_CLASS_MAPPINGS.update(TEST_STUB_NODE_CLASS_MAPPINGS) NODE_CLASS_MAPPINGS.update(ASYNC_TEST_NODE_CLASS_MAPPINGS) +NODE_CLASS_MAPPINGS.update(API_TEST_NODE_CLASS_MAPPINGS) NODE_DISPLAY_NAME_MAPPINGS = {} NODE_DISPLAY_NAME_MAPPINGS.update(TEST_NODE_DISPLAY_NAME_MAPPINGS) @@ -23,4 +25,4 @@ NODE_DISPLAY_NAME_MAPPINGS.update(UTILITY_NODE_DISPLAY_NAME_MAPPINGS) NODE_DISPLAY_NAME_MAPPINGS.update(CONDITION_NODE_DISPLAY_NAME_MAPPINGS) NODE_DISPLAY_NAME_MAPPINGS.update(TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS) NODE_DISPLAY_NAME_MAPPINGS.update(ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS) - +NODE_DISPLAY_NAME_MAPPINGS.update(API_TEST_NODE_DISPLAY_NAME_MAPPINGS) diff --git a/tests/inference/testing_nodes/testing-pack/api_test_nodes.py b/tests/inference/testing_nodes/testing-pack/api_test_nodes.py new file mode 100644 index 000000000..b2eaae05e --- /dev/null +++ b/tests/inference/testing_nodes/testing-pack/api_test_nodes.py @@ -0,0 +1,78 @@ +import asyncio +import time +from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict +from comfy_api.v0_0_2 import ComfyAPI, ComfyAPISync + +api = ComfyAPI() +api_sync = ComfyAPISync() + + +class TestAsyncProgressUpdate(ComfyNodeABC): + """Test node with async VALIDATE_INPUTS.""" + + @classmethod + def INPUT_TYPES(cls) -> InputTypeDict: + return { + "required": { + "value": (IO.ANY, {}), + "sleep_seconds": (IO.FLOAT, {"default": 1.0}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "execute" + CATEGORY = "_for_testing/async" + + async def execute(self, value, sleep_seconds): + start = time.time() + expiration = start + sleep_seconds + now = start + while now < expiration: + now = time.time() + await api.execution.set_progress( + value=(now - start) / sleep_seconds, + max_value=1.0, + ) + await asyncio.sleep(0.01) + return (value,) + + +class TestSyncProgressUpdate(ComfyNodeABC): + """Test node with async VALIDATE_INPUTS.""" + + @classmethod + def INPUT_TYPES(cls) -> InputTypeDict: + return { + "required": { + "value": (IO.ANY, {}), + "sleep_seconds": (IO.FLOAT, {"default": 1.0}), + }, + } + + RETURN_TYPES = (IO.ANY,) + FUNCTION = "execute" + CATEGORY = "_for_testing/async" + + def execute(self, value, sleep_seconds): + start = time.time() + expiration = start + sleep_seconds + now = start + while now < expiration: + now = time.time() + api_sync.execution.set_progress( + value=(now - start) / sleep_seconds, + max_value=1.0, + ) + time.sleep(0.01) + return (value,) + + +API_TEST_NODE_CLASS_MAPPINGS = { + "TestAsyncProgressUpdate": TestAsyncProgressUpdate, + "TestSyncProgressUpdate": TestSyncProgressUpdate, +} + +API_TEST_NODE_DISPLAY_NAME_MAPPINGS = { + "TestAsyncProgressUpdate": "Async Progress Update Test Node", + "TestSyncProgressUpdate": "Sync Progress Update Test Node", +} From d2aaef029cfb60611f2c9aad1b5dfb7070f9c162 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 30 Jul 2025 10:50:49 +0800 Subject: [PATCH 077/759] Update template to 0.1.44 (#9104) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 14a085a2c..8f2f6a56c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.41 +comfyui-workflow-templates==0.1.44 comfyui-embedded-docs==0.2.4 torch torchsde From da9dab7edd36fa75e010d2c8498e1ba23b8b97dd Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 30 Jul 2025 02:55:26 -0700 Subject: [PATCH 078/759] Small wan camera memory optimization. (#9111) --- comfy/ldm/wan/model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index a93a13c86..86d0795e9 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -769,8 +769,7 @@ class CameraWanModel(WanModel): # embeddings x = self.patch_embedding(x.float()).to(x.dtype) if self.control_adapter is not None and camera_conditions is not None: - x_camera = self.control_adapter(camera_conditions).to(x.dtype) - x = x + x_camera + x = x + self.control_adapter(camera_conditions).to(x.dtype) grid_sizes = x.shape[2:] x = x.flatten(2).transpose(1, 2) From 61b08d4ba65fec37070376bf50da3ec3c534e859 Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Thu, 31 Jul 2025 07:25:56 +0800 Subject: [PATCH 079/759] Replace manual x * sigmoid(x) with torch silu in VAE nonlinearity (#9057) --- comfy/ldm/cosmos/cosmos_tokenizer/utils.py | 3 ++- comfy/ldm/modules/diffusionmodules/model.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/cosmos/cosmos_tokenizer/utils.py b/comfy/ldm/cosmos/cosmos_tokenizer/utils.py index 3af8d0d05..ca993006f 100644 --- a/comfy/ldm/cosmos/cosmos_tokenizer/utils.py +++ b/comfy/ldm/cosmos/cosmos_tokenizer/utils.py @@ -58,7 +58,8 @@ def is_odd(n: int) -> bool: def nonlinearity(x): - return x * torch.sigmoid(x) + # x * sigmoid(x) + return torch.nn.functional.silu(x) def Normalize(in_channels, num_groups=32): diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index 8162742cf..5c0373b74 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -36,7 +36,7 @@ def get_timestep_embedding(timesteps, embedding_dim): def nonlinearity(x): # swish - return x*torch.sigmoid(x) + return torch.nn.functional.silu(x) def Normalize(in_channels, num_groups=32): From 97eb256a355b434bbc96ec27bbce33dd10273857 Mon Sep 17 00:00:00 2001 From: guill Date: Wed, 30 Jul 2025 19:55:28 -0700 Subject: [PATCH 080/759] Add support for partial execution in backend (#9123) When a prompt is submitted, it can optionally include `partial_execution_targets` as a list of ids. If it does, rather than adding all outputs to the execution list, we add only those in the list. --- execution.py | 7 +- server.py | 7 +- tests/inference/test_async_nodes.py | 15 +- tests/inference/test_execution.py | 202 ++++++++++++++++-- .../testing-pack/specific_tests.py | 21 ++ 5 files changed, 233 insertions(+), 19 deletions(-) diff --git a/execution.py b/execution.py index 8a9663a7d..cde14c52f 100644 --- a/execution.py +++ b/execution.py @@ -7,7 +7,7 @@ import threading import time import traceback from enum import Enum -from typing import List, Literal, NamedTuple, Optional +from typing import List, Literal, NamedTuple, Optional, Union import asyncio import torch @@ -891,7 +891,7 @@ def full_type_name(klass): return klass.__qualname__ return module + '.' + klass.__qualname__ -async def validate_prompt(prompt_id, prompt): +async def validate_prompt(prompt_id, prompt, partial_execution_list: Union[list[str], None]): outputs = set() for x in prompt: if 'class_type' not in prompt[x]: @@ -915,7 +915,8 @@ async def validate_prompt(prompt_id, prompt): return (False, error, [], {}) if hasattr(class_, 'OUTPUT_NODE') and class_.OUTPUT_NODE is True: - outputs.add(x) + if partial_execution_list is None or x in partial_execution_list: + outputs.add(x) if len(outputs) == 0: error = { diff --git a/server.py b/server.py index f4de0079b..3e06d2fbb 100644 --- a/server.py +++ b/server.py @@ -681,7 +681,12 @@ class PromptServer(): if "prompt" in json_data: prompt = json_data["prompt"] prompt_id = str(json_data.get("prompt_id", uuid.uuid4())) - valid = await execution.validate_prompt(prompt_id, prompt) + + partial_execution_targets = None + if "partial_execution_targets" in json_data: + partial_execution_targets = json_data["partial_execution_targets"] + + valid = await execution.validate_prompt(prompt_id, prompt, partial_execution_targets) extra_data = {} if "extra_data" in json_data: extra_data = json_data["extra_data"] diff --git a/tests/inference/test_async_nodes.py b/tests/inference/test_async_nodes.py index b243bbca9..f029953dd 100644 --- a/tests/inference/test_async_nodes.py +++ b/tests/inference/test_async_nodes.py @@ -7,7 +7,7 @@ import subprocess from pytest import fixture from comfy_execution.graph_utils import GraphBuilder -from tests.inference.test_execution import ComfyClient +from tests.inference.test_execution import ComfyClient, run_warmup @pytest.mark.execution @@ -24,6 +24,7 @@ class TestAsyncNodes: '--listen', args_pytest["listen"], '--port', str(args_pytest["port"]), '--extra-model-paths-config', 'tests/inference/extra_model_paths.yaml', + '--cpu', ] use_lru, lru_size = request.param if use_lru: @@ -82,6 +83,9 @@ class TestAsyncNodes: def test_multiple_async_parallel_execution(self, client: ComfyClient, builder: GraphBuilder): """Test that multiple async nodes execute in parallel.""" + # Warmup execution to ensure server is fully initialized + run_warmup(client) + g = builder image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) @@ -148,6 +152,9 @@ class TestAsyncNodes: def test_async_lazy_evaluation(self, client: ComfyClient, builder: GraphBuilder): """Test async nodes with lazy evaluation.""" + # Warmup execution to ensure server is fully initialized + run_warmup(client, prefix="warmup_lazy") + g = builder input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) @@ -305,6 +312,9 @@ class TestAsyncNodes: def test_async_caching_behavior(self, client: ComfyClient, builder: GraphBuilder): """Test that async nodes are properly cached.""" + # Warmup execution to ensure server is fully initialized + run_warmup(client, prefix="warmup_cache") + g = builder image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) sleep_node = g.node("TestSleep", value=image.out(0), seconds=0.2) @@ -324,6 +334,9 @@ class TestAsyncNodes: def test_async_with_dynamic_prompts(self, client: ComfyClient, builder: GraphBuilder): """Test async nodes within dynamically generated prompts.""" + # Warmup execution to ensure server is fully initialized + run_warmup(client, prefix="warmup_dynamic") + g = builder image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) diff --git a/tests/inference/test_execution.py b/tests/inference/test_execution.py index 9d3d685cc..e7b29302e 100644 --- a/tests/inference/test_execution.py +++ b/tests/inference/test_execution.py @@ -15,10 +15,18 @@ import urllib.parse import urllib.error from comfy_execution.graph_utils import GraphBuilder, Node +def run_warmup(client, prefix="warmup"): + """Run a simple workflow to warm up the server.""" + warmup_g = GraphBuilder(prefix=prefix) + warmup_image = warmup_g.node("StubImage", content="BLACK", height=32, width=32, batch_size=1) + warmup_g.node("PreviewImage", images=warmup_image.out(0)) + client.run(warmup_g) + class RunResult: def __init__(self, prompt_id: str): self.outputs: Dict[str,Dict] = {} self.runs: Dict[str,bool] = {} + self.cached: Dict[str,bool] = {} self.prompt_id: str = prompt_id def get_output(self, node: Node): @@ -27,6 +35,13 @@ class RunResult: def did_run(self, node: Node): return self.runs.get(node.id, False) + def was_cached(self, node: Node): + return self.cached.get(node.id, False) + + def was_executed(self, node: Node): + """Returns True if node was either run or cached""" + return self.did_run(node) or self.was_cached(node) + def get_images(self, node: Node): output = self.get_output(node) if output is None: @@ -51,8 +66,10 @@ class ComfyClient: ws.connect("ws://{}/ws?clientId={}".format(self.server_address, self.client_id)) self.ws = ws - def queue_prompt(self, prompt): + def queue_prompt(self, prompt, partial_execution_targets=None): p = {"prompt": prompt, "client_id": self.client_id} + if partial_execution_targets is not None: + p["partial_execution_targets"] = partial_execution_targets data = json.dumps(p).encode('utf-8') req = urllib.request.Request("http://{}/prompt".format(self.server_address), data=data) return json.loads(urllib.request.urlopen(req).read()) @@ -70,13 +87,13 @@ class ComfyClient: def set_test_name(self, name): self.test_name = name - def run(self, graph): + def run(self, graph, partial_execution_targets=None): prompt = graph.finalize() for node in graph.nodes.values(): if node.class_type == 'SaveImage': node.inputs['filename_prefix'] = self.test_name - prompt_id = self.queue_prompt(prompt)['prompt_id'] + prompt_id = self.queue_prompt(prompt, partial_execution_targets)['prompt_id'] result = RunResult(prompt_id) while True: out = self.ws.recv() @@ -92,7 +109,10 @@ class ComfyClient: elif message['type'] == 'execution_error': raise Exception(message['data']) elif message['type'] == 'execution_cached': - pass # Probably want to store this off for testing + if message['data']['prompt_id'] == prompt_id: + cached_nodes = message['data'].get('nodes', []) + for node_id in cached_nodes: + result.cached[node_id] = True history = self.get_history(prompt_id)[prompt_id] for node_id in history['outputs']: @@ -130,6 +150,7 @@ class TestExecution: '--listen', args_pytest["listen"], '--port', str(args_pytest["port"]), '--extra-model-paths-config', 'tests/inference/extra_model_paths.yaml', + '--cpu', ] use_lru, lru_size = request.param if use_lru: @@ -498,12 +519,15 @@ class TestExecution: assert not result.did_run(test_node), "The execution should have been cached" def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder): + # Warmup execution to ensure server is fully initialized + run_warmup(client) + g = builder image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) # Create sleep nodes for each duration - sleep_node1 = g.node("TestSleep", value=image.out(0), seconds=2.8) - sleep_node2 = g.node("TestSleep", value=image.out(0), seconds=2.9) + sleep_node1 = g.node("TestSleep", value=image.out(0), seconds=2.9) + sleep_node2 = g.node("TestSleep", value=image.out(0), seconds=3.1) sleep_node3 = g.node("TestSleep", value=image.out(0), seconds=3.0) # Add outputs to verify the execution @@ -515,10 +539,9 @@ class TestExecution: result = client.run(g) elapsed_time = time.time() - start_time - # The test should take around 0.4 seconds (the longest sleep duration) - # plus some overhead, but definitely less than the sum of all sleeps (0.9s) - # We'll allow for up to 0.8s total to account for overhead - assert elapsed_time < 4.0, f"Parallel execution took {elapsed_time}s, expected less than 0.8s" + # The test should take around 3.0 seconds (the longest sleep duration) + # plus some overhead, but definitely less than the sum of all sleeps (9.0s) + assert elapsed_time < 8.9, f"Parallel execution took {elapsed_time}s, expected less than 8.9s" # Verify that all nodes executed assert result.did_run(sleep_node1), "Sleep node 1 should have run" @@ -526,6 +549,9 @@ class TestExecution: assert result.did_run(sleep_node3), "Sleep node 3 should have run" def test_parallel_sleep_expansion(self, client: ComfyClient, builder: GraphBuilder): + # Warmup execution to ensure server is fully initialized + run_warmup(client) + g = builder # Create input images with different values image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) @@ -537,9 +563,9 @@ class TestExecution: image1=image1.out(0), image2=image2.out(0), image3=image3.out(0), - sleep1=0.4, - sleep2=0.5, - sleep3=0.6) + sleep1=4.8, + sleep2=4.9, + sleep3=5.0) output = g.node("SaveImage", images=parallel_sleep.out(0)) start_time = time.time() @@ -548,7 +574,7 @@ class TestExecution: # Similar to the previous test, expect parallel execution of the sleep nodes # which should complete in less than the sum of all sleeps - assert elapsed_time < 0.8, f"Expansion execution took {elapsed_time}s, expected less than 0.8s" + assert elapsed_time < 10.0, f"Expansion execution took {elapsed_time}s, expected less than 5.5s" # Verify the parallel sleep node executed assert result.did_run(parallel_sleep), "ParallelSleep node should have run" @@ -585,3 +611,151 @@ class TestExecution: assert len(images) == 2, "Should have 2 images" assert numpy.array(images[0]).min() == 0 and numpy.array(images[0]).max() == 0, "First image should be black" assert numpy.array(images[1]).min() == 0 and numpy.array(images[1]).max() == 0, "Second image should also be black" + + # Output nodes included in the partial execution list are executed + def test_partial_execution_included_outputs(self, client: ComfyClient, builder: GraphBuilder): + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + + # Create two separate output nodes + output1 = g.node("SaveImage", images=input1.out(0)) + output2 = g.node("SaveImage", images=input2.out(0)) + + # Run with partial execution targeting only output1 + result = client.run(g, partial_execution_targets=[output1.id]) + + assert result.was_executed(input1), "Input1 should have been executed (run or cached)" + assert result.was_executed(output1), "Output1 should have been executed (run or cached)" + assert not result.did_run(input2), "Input2 should not have run" + assert not result.did_run(output2), "Output2 should not have run" + + # Verify only output1 produced results + assert len(result.get_images(output1)) == 1, "Output1 should have produced an image" + assert len(result.get_images(output2)) == 0, "Output2 should not have produced an image" + + # Output nodes NOT included in the partial execution list are NOT executed + def test_partial_execution_excluded_outputs(self, client: ComfyClient, builder: GraphBuilder): + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + input3 = g.node("StubImage", content="NOISE", height=512, width=512, batch_size=1) + + # Create three output nodes + output1 = g.node("SaveImage", images=input1.out(0)) + output2 = g.node("SaveImage", images=input2.out(0)) + output3 = g.node("SaveImage", images=input3.out(0)) + + # Run with partial execution targeting only output1 and output3 + result = client.run(g, partial_execution_targets=[output1.id, output3.id]) + + assert result.was_executed(input1), "Input1 should have been executed" + assert result.was_executed(input3), "Input3 should have been executed" + assert result.was_executed(output1), "Output1 should have been executed" + assert result.was_executed(output3), "Output3 should have been executed" + assert not result.did_run(input2), "Input2 should not have run" + assert not result.did_run(output2), "Output2 should not have run" + + # Output nodes NOT in list ARE executed if necessary for nodes that are in the list + def test_partial_execution_dependencies(self, client: ComfyClient, builder: GraphBuilder): + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + + # Create a processing chain with an OUTPUT_NODE that has socket outputs + output_with_socket = g.node("TestOutputNodeWithSocketOutput", image=input1.out(0), value=2.0) + + # Create another node that depends on the output_with_socket + dependent_node = g.node("TestLazyMixImages", + image1=output_with_socket.out(0), + image2=input1.out(0), + mask=g.node("StubMask", value=0.5, height=512, width=512, batch_size=1).out(0)) + + # Create the final output + final_output = g.node("SaveImage", images=dependent_node.out(0)) + + # Run with partial execution targeting only the final output + result = client.run(g, partial_execution_targets=[final_output.id]) + + # All nodes should have been executed because they're dependencies + assert result.was_executed(input1), "Input1 should have been executed" + assert result.was_executed(output_with_socket), "Output with socket should have been executed (dependency)" + assert result.was_executed(dependent_node), "Dependent node should have been executed" + assert result.was_executed(final_output), "Final output should have been executed" + + # Lazy execution works with partial execution + def test_partial_execution_with_lazy_nodes(self, client: ComfyClient, builder: GraphBuilder): + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + input3 = g.node("StubImage", content="NOISE", height=512, width=512, batch_size=1) + + # Create masks that will trigger different lazy execution paths + mask1 = g.node("StubMask", value=0.0, height=512, width=512, batch_size=1) # Will only need image1 + mask2 = g.node("StubMask", value=0.5, height=512, width=512, batch_size=1) # Will need both images + + # Create two lazy mix nodes + lazy_mix1 = g.node("TestLazyMixImages", image1=input1.out(0), image2=input2.out(0), mask=mask1.out(0)) + lazy_mix2 = g.node("TestLazyMixImages", image1=input2.out(0), image2=input3.out(0), mask=mask2.out(0)) + + output1 = g.node("SaveImage", images=lazy_mix1.out(0)) + output2 = g.node("SaveImage", images=lazy_mix2.out(0)) + + # Run with partial execution targeting only output1 + result = client.run(g, partial_execution_targets=[output1.id]) + + # For output1 path - only input1 should run due to lazy evaluation (mask=0.0) + assert result.was_executed(input1), "Input1 should have been executed" + assert not result.did_run(input2), "Input2 should not have run (lazy evaluation)" + assert result.was_executed(mask1), "Mask1 should have been executed" + assert result.was_executed(lazy_mix1), "Lazy mix1 should have been executed" + assert result.was_executed(output1), "Output1 should have been executed" + + # Nothing from output2 path should run + assert not result.did_run(input3), "Input3 should not have run" + assert not result.did_run(mask2), "Mask2 should not have run" + assert not result.did_run(lazy_mix2), "Lazy mix2 should not have run" + assert not result.did_run(output2), "Output2 should not have run" + + # Multiple OUTPUT_NODEs with dependencies + def test_partial_execution_multiple_output_nodes(self, client: ComfyClient, builder: GraphBuilder): + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1) + + # Create a chain of OUTPUT_NODEs + output_node1 = g.node("TestOutputNodeWithSocketOutput", image=input1.out(0), value=1.5) + output_node2 = g.node("TestOutputNodeWithSocketOutput", image=output_node1.out(0), value=2.0) + + # Create regular output nodes + save1 = g.node("SaveImage", images=output_node1.out(0)) + save2 = g.node("SaveImage", images=output_node2.out(0)) + save3 = g.node("SaveImage", images=input2.out(0)) + + # Run targeting only save2 + result = client.run(g, partial_execution_targets=[save2.id]) + + # Should run: input1, output_node1, output_node2, save2 + assert result.was_executed(input1), "Input1 should have been executed" + assert result.was_executed(output_node1), "Output node 1 should have been executed (dependency)" + assert result.was_executed(output_node2), "Output node 2 should have been executed (dependency)" + assert result.was_executed(save2), "Save2 should have been executed" + + # Should NOT run: input2, save1, save3 + assert not result.did_run(input2), "Input2 should not have run" + assert not result.did_run(save1), "Save1 should not have run" + assert not result.did_run(save3), "Save3 should not have run" + + # Empty partial execution list (should execute nothing) + def test_partial_execution_empty_list(self, client: ComfyClient, builder: GraphBuilder): + g = builder + input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) + _output1 = g.node("SaveImage", images=input1.out(0)) + + # Run with empty partial execution list + try: + _result = client.run(g, partial_execution_targets=[]) + # Should get an error because no outputs are selected + assert False, "Should have raised an error for empty partial execution list" + except urllib.error.HTTPError: + pass # Expected behavior + diff --git a/tests/inference/testing_nodes/testing-pack/specific_tests.py b/tests/inference/testing_nodes/testing-pack/specific_tests.py index 657d49f2f..4f8f01ae4 100644 --- a/tests/inference/testing_nodes/testing-pack/specific_tests.py +++ b/tests/inference/testing_nodes/testing-pack/specific_tests.py @@ -463,6 +463,25 @@ class TestParallelSleep(ComfyNodeABC): "expand": g.finalize(), } +class TestOutputNodeWithSocketOutput: + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "image": ("IMAGE",), + "value": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0}), + }, + } + RETURN_TYPES = ("IMAGE",) + FUNCTION = "process" + CATEGORY = "_for_testing" + OUTPUT_NODE = True + + def process(self, image, value): + # Apply value scaling and return both as output and socket + result = image * value + return (result,) + TEST_NODE_CLASS_MAPPINGS = { "TestLazyMixImages": TestLazyMixImages, "TestVariadicAverage": TestVariadicAverage, @@ -478,6 +497,7 @@ TEST_NODE_CLASS_MAPPINGS = { "TestSamplingInExpansion": TestSamplingInExpansion, "TestSleep": TestSleep, "TestParallelSleep": TestParallelSleep, + "TestOutputNodeWithSocketOutput": TestOutputNodeWithSocketOutput, } TEST_NODE_DISPLAY_NAME_MAPPINGS = { @@ -495,4 +515,5 @@ TEST_NODE_DISPLAY_NAME_MAPPINGS = { "TestSamplingInExpansion": "Sampling In Expansion", "TestSleep": "Test Sleep", "TestParallelSleep": "Test Parallel Sleep", + "TestOutputNodeWithSocketOutput": "Test Output Node With Socket Output", } From 97b8a2c26a335fe70ac6cfb44bf225454f51d700 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 31 Jul 2025 02:46:23 -0700 Subject: [PATCH 081/759] More accurate explanation of release process. (#9126) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index befc4c006..2abd8e600 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,7 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git ## Release Process -ComfyUI follows a weekly release cycle every Friday, with three interconnected repositories: +ComfyUI follows a weekly release cycle targeting Friday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories: 1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)** - Releases a new stable version (e.g., v0.7.0) From 4887743a2aef67e05909aeea61f6cdc93e269de3 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Thu, 31 Jul 2025 15:02:12 -0700 Subject: [PATCH 082/759] V3 Node Schema Definition - initial (#8656) --- comfy_api/internal/__init__.py | 143 +++ comfy_api/latest/__init__.py | 18 + comfy_api/latest/_io.py | 1618 ++++++++++++++++++++++++++++++++ comfy_api/latest/_resources.py | 72 ++ comfy_api/latest/_ui.py | 457 +++++++++ comfy_api/v0_0_2/__init__.py | 2 + comfy_execution/graph.py | 23 +- comfy_execution/graph_utils.py | 16 + execution.py | 140 ++- nodes.py | 37 +- server.py | 3 + 11 files changed, 2475 insertions(+), 54 deletions(-) create mode 100644 comfy_api/latest/_io.py create mode 100644 comfy_api/latest/_resources.py create mode 100644 comfy_api/latest/_ui.py diff --git a/comfy_api/internal/__init__.py b/comfy_api/internal/__init__.py index c00b1fdbb..4ca02e320 100644 --- a/comfy_api/internal/__init__.py +++ b/comfy_api/internal/__init__.py @@ -5,3 +5,146 @@ from .api_registry import ( register_versions as register_versions, get_all_versions as get_all_versions, ) + +import asyncio +from dataclasses import asdict +from typing import Callable, Optional + + +def first_real_override(cls: type, name: str, *, base: type=None) -> Optional[Callable]: + """Return the *callable* override of `name` visible on `cls`, or None if every + implementation up to (and including) `base` is the placeholder defined on `base`. + + If base is not provided, it will assume cls has a GET_BASE_CLASS + """ + if base is None: + if not hasattr(cls, "GET_BASE_CLASS"): + raise ValueError("base is required if cls does not have a GET_BASE_CLASS; is this a valid ComfyNode subclass?") + base = cls.GET_BASE_CLASS() + base_attr = getattr(base, name, None) + if base_attr is None: + return None + base_func = base_attr.__func__ + for c in cls.mro(): # NodeB, NodeA, ComfyNode, object … + if c is base: # reached the placeholder – we're done + break + if name in c.__dict__: # first class that *defines* the attr + func = getattr(c, name).__func__ + if func is not base_func: # real override + return getattr(cls, name) # bound to *cls* + return None + + +class _ComfyNodeInternal: + """Class that all V3-based APIs inherit from for ComfyNode. + + This is intended to only be referenced within execution.py, as it has to handle all V3 APIs going forward.""" + @classmethod + def GET_NODE_INFO_V1(cls): + ... + + +class _NodeOutputInternal: + """Class that all V3-based APIs inherit from for NodeOutput. + + This is intended to only be referenced within execution.py, as it has to handle all V3 APIs going forward.""" + ... + + +def as_pruned_dict(dataclass_obj): + '''Return dict of dataclass object with pruned None values.''' + return prune_dict(asdict(dataclass_obj)) + +def prune_dict(d: dict): + return {k: v for k,v in d.items() if v is not None} + + +def is_class(obj): + ''' + Returns True if is a class type. + Returns False if is a class instance. + ''' + return isinstance(obj, type) + + +def copy_class(cls: type) -> type: + ''' + Copy a class and its attributes. + ''' + if cls is None: + return None + cls_dict = { + k: v for k, v in cls.__dict__.items() + if k not in ('__dict__', '__weakref__', '__module__', '__doc__') + } + # new class + new_cls = type( + cls.__name__, + (cls,), + cls_dict + ) + # metadata preservation + new_cls.__module__ = cls.__module__ + new_cls.__doc__ = cls.__doc__ + return new_cls + + +class classproperty(object): + def __init__(self, f): + self.f = f + def __get__(self, obj, owner): + return self.f(owner) + + +# NOTE: this was ai generated and validated by hand +def shallow_clone_class(cls, new_name=None): + ''' + Shallow clone a class while preserving super() functionality. + ''' + new_name = new_name or f"{cls.__name__}Clone" + # Include the original class in the bases to maintain proper inheritance + new_bases = (cls,) + cls.__bases__ + return type(new_name, new_bases, dict(cls.__dict__)) + +# NOTE: this was ai generated and validated by hand +def lock_class(cls): + ''' + Lock a class so that its top-levelattributes cannot be modified. + ''' + # Locked instance __setattr__ + def locked_instance_setattr(self, name, value): + raise AttributeError( + f"Cannot set attribute '{name}' on immutable instance of {type(self).__name__}" + ) + # Locked metaclass + class LockedMeta(type(cls)): + def __setattr__(cls_, name, value): + raise AttributeError( + f"Cannot modify class attribute '{name}' on locked class '{cls_.__name__}'" + ) + # Rebuild class with locked behavior + locked_dict = dict(cls.__dict__) + locked_dict['__setattr__'] = locked_instance_setattr + + return LockedMeta(cls.__name__, cls.__bases__, locked_dict) + + +def make_locked_method_func(type_obj, func, class_clone): + """ + Returns a function that, when called with **inputs, will execute: + getattr(type_obj, func).__func__(lock_class(class_clone), **inputs) + + Supports both synchronous and asynchronous methods. + """ + locked_class = lock_class(class_clone) + method = getattr(type_obj, func).__func__ + + # Check if the original method is async + if asyncio.iscoroutinefunction(method): + async def wrapped_async_func(**inputs): + return await method(locked_class, **inputs) + return wrapped_async_func + else: + def wrapped_func(**inputs): + return method(locked_class, **inputs) + return wrapped_func diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py index e1f3a3655..2cee65aa9 100644 --- a/comfy_api/latest/__init__.py +++ b/comfy_api/latest/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from abc import ABC, abstractmethod from typing import Type, TYPE_CHECKING from comfy_api.internal import ComfyAPIBase from comfy_api.internal.singleton import ProxiedSingleton @@ -7,6 +8,9 @@ from comfy_api.internal.async_to_sync import create_sync_class from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents +from comfy_api.latest._io import _IO as io #noqa: F401 +from comfy_api.latest._ui import _UI as ui #noqa: F401 +# from comfy_api.latest._resources import _RESOURCES as resources #noqa: F401 from comfy_execution.utils import get_executing_context from comfy_execution.progress import get_progress_state, PreviewImageTuple from PIL import Image @@ -72,6 +76,19 @@ class ComfyAPI_latest(ComfyAPIBase): execution: Execution +class ComfyExtension(ABC): + async def on_load(self) -> None: + """ + Called when an extension is loaded. + This should be used to initialize any global resources neeeded by the extension. + """ + + @abstractmethod + async def get_node_list(self) -> list[type[io.ComfyNode]]: + """ + Returns a list of nodes that this extension provides. + """ + class Input: Image = ImageInput Audio = AudioInput @@ -103,4 +120,5 @@ __all__ = [ "Input", "InputImpl", "Types", + "ComfyExtension", ] diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py new file mode 100644 index 000000000..ec1efb51d --- /dev/null +++ b/comfy_api/latest/_io.py @@ -0,0 +1,1618 @@ +from __future__ import annotations + +import copy +import inspect +from abc import ABC, abstractmethod +from collections import Counter +from dataclasses import asdict, dataclass +from enum import Enum +from typing import Any, Callable, Literal, TypedDict, TypeVar, TYPE_CHECKING +from typing_extensions import NotRequired, final + +# used for type hinting +import torch + +if TYPE_CHECKING: + from spandrel import ImageModelDescriptor + from comfy.clip_vision import ClipVisionModel + from comfy.clip_vision import Output as ClipVisionOutput_ + from comfy.controlnet import ControlNet + from comfy.hooks import HookGroup, HookKeyframeGroup + from comfy.model_patcher import ModelPatcher + from comfy.samplers import CFGGuider, Sampler + from comfy.sd import CLIP, VAE + from comfy.sd import StyleModel as StyleModel_ + from comfy_api.input import VideoInput +from comfy_api.internal import (_ComfyNodeInternal, _NodeOutputInternal, classproperty, copy_class, first_real_override, is_class, + prune_dict, shallow_clone_class) +from comfy_api.latest._resources import Resources, ResourcesLocal +from comfy_execution.graph_utils import ExecutionBlocker + +# from comfy_extras.nodes_images import SVG as SVG_ # NOTE: needs to be moved before can be imported due to circular reference + +class FolderType(str, Enum): + input = "input" + output = "output" + temp = "temp" + + +class UploadType(str, Enum): + image = "image_upload" + audio = "audio_upload" + video = "video_upload" + model = "file_upload" + + +class RemoteOptions: + def __init__(self, route: str, refresh_button: bool, control_after_refresh: Literal["first", "last"]="first", + timeout: int=None, max_retries: int=None, refresh: int=None): + self.route = route + """The route to the remote source.""" + self.refresh_button = refresh_button + """Specifies whether to show a refresh button in the UI below the widget.""" + self.control_after_refresh = control_after_refresh + """Specifies the control after the refresh button is clicked. If "first", the first item will be automatically selected, and so on.""" + self.timeout = timeout + """The maximum amount of time to wait for a response from the remote source in milliseconds.""" + self.max_retries = max_retries + """The maximum number of retries before aborting the request.""" + self.refresh = refresh + """The TTL of the remote input's value in milliseconds. Specifies the interval at which the remote input's value is refreshed.""" + + def as_dict(self): + return prune_dict({ + "route": self.route, + "refresh_button": self.refresh_button, + "control_after_refresh": self.control_after_refresh, + "timeout": self.timeout, + "max_retries": self.max_retries, + "refresh": self.refresh, + }) + + +class NumberDisplay(str, Enum): + number = "number" + slider = "slider" + + +class _StringIOType(str): + def __ne__(self, value: object) -> bool: + if self == "*" or value == "*": + return False + if not isinstance(value, str): + return True + a = frozenset(self.split(",")) + b = frozenset(value.split(",")) + return not (b.issubset(a) or a.issubset(b)) + +class _ComfyType(ABC): + Type = Any + io_type: str = None + +# NOTE: this is a workaround to make the decorator return the correct type +T = TypeVar("T", bound=type) +def comfytype(io_type: str, **kwargs): + ''' + Decorator to mark nested classes as ComfyType; io_type will be bound to the class. + + A ComfyType may have the following attributes: + - Type = + - class Input(Input): ... + - class Output(Output): ... + ''' + def decorator(cls: T) -> T: + if isinstance(cls, _ComfyType) or issubclass(cls, _ComfyType): + # clone Input and Output classes to avoid modifying the original class + new_cls = cls + if hasattr(new_cls, "Input"): + new_cls.Input = copy_class(new_cls.Input) + if hasattr(new_cls, "Output"): + new_cls.Output = copy_class(new_cls.Output) + else: + # copy class attributes except for special ones that shouldn't be in type() + cls_dict = { + k: v for k, v in cls.__dict__.items() + if k not in ('__dict__', '__weakref__', '__module__', '__doc__') + } + # new class + new_cls: ComfyTypeIO = type( + cls.__name__, + (cls, ComfyTypeIO), + cls_dict + ) + # metadata preservation + new_cls.__module__ = cls.__module__ + new_cls.__doc__ = cls.__doc__ + # assign ComfyType attributes, if needed + # NOTE: use __ne__ trick for io_type (see node_typing.IO.__ne__ for details) + new_cls.io_type = _StringIOType(io_type) + if hasattr(new_cls, "Input") and new_cls.Input is not None: + new_cls.Input.Parent = new_cls + if hasattr(new_cls, "Output") and new_cls.Output is not None: + new_cls.Output.Parent = new_cls + return new_cls + return decorator + +def Custom(io_type: str) -> type[ComfyTypeIO]: + '''Create a ComfyType for a custom io_type.''' + @comfytype(io_type=io_type) + class CustomComfyType(ComfyTypeIO): + ... + return CustomComfyType + +class _IO_V3: + ''' + Base class for V3 Inputs and Outputs. + ''' + Parent: _ComfyType = None + + def __init__(self): + pass + + @property + def io_type(self): + return self.Parent.io_type + + @property + def Type(self): + return self.Parent.Type + +class Input(_IO_V3): + ''' + Base class for a V3 Input. + ''' + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None): + super().__init__() + self.id = id + self.display_name = display_name + self.optional = optional + self.tooltip = tooltip + self.lazy = lazy + self.extra_dict = extra_dict if extra_dict is not None else {} + + def as_dict(self): + return prune_dict({ + "display_name": self.display_name, + "optional": self.optional, + "tooltip": self.tooltip, + "lazy": self.lazy, + }) | prune_dict(self.extra_dict) + + def get_io_type(self): + return _StringIOType(self.io_type) + +class WidgetInput(Input): + ''' + Base class for a V3 Input with widget. + ''' + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + default: Any=None, + socketless: bool=None, widget_type: str=None, force_input: bool=None, extra_dict=None): + super().__init__(id, display_name, optional, tooltip, lazy, extra_dict) + self.default = default + self.socketless = socketless + self.widget_type = widget_type + self.force_input = force_input + + def as_dict(self): + return super().as_dict() | prune_dict({ + "default": self.default, + "socketless": self.socketless, + "widgetType": self.widget_type, + "forceInput": self.force_input, + }) + + def get_io_type(self): + return self.widget_type if self.widget_type is not None else super().get_io_type() + + +class Output(_IO_V3): + def __init__(self, id: str=None, display_name: str=None, tooltip: str=None, + is_output_list=False): + self.id = id + self.display_name = display_name + self.tooltip = tooltip + self.is_output_list = is_output_list + + def as_dict(self): + return prune_dict({ + "display_name": self.display_name, + "tooltip": self.tooltip, + "is_output_list": self.is_output_list, + }) + + def get_io_type(self): + return self.io_type + + +class ComfyTypeI(_ComfyType): + '''ComfyType subclass that only has a default Input class - intended for types that only have Inputs.''' + class Input(Input): + ... + +class ComfyTypeIO(ComfyTypeI): + '''ComfyType subclass that has default Input and Output classes; useful for types with both Inputs and Outputs.''' + class Output(Output): + ... + + +@comfytype(io_type="BOOLEAN") +class Boolean(ComfyTypeIO): + Type = bool + + class Input(WidgetInput): + '''Boolean input.''' + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + default: bool=None, label_on: str=None, label_off: str=None, + socketless: bool=None, force_input: bool=None): + super().__init__(id, display_name, optional, tooltip, lazy, default, socketless, None, force_input) + self.label_on = label_on + self.label_off = label_off + self.default: bool + + def as_dict(self): + return super().as_dict() | prune_dict({ + "label_on": self.label_on, + "label_off": self.label_off, + }) + +@comfytype(io_type="INT") +class Int(ComfyTypeIO): + Type = int + + class Input(WidgetInput): + '''Integer input.''' + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + default: int=None, min: int=None, max: int=None, step: int=None, control_after_generate: bool=None, + display_mode: NumberDisplay=None, socketless: bool=None, force_input: bool=None): + super().__init__(id, display_name, optional, tooltip, lazy, default, socketless, None, force_input) + self.min = min + self.max = max + self.step = step + self.control_after_generate = control_after_generate + self.display_mode = display_mode + self.default: int + + def as_dict(self): + return super().as_dict() | prune_dict({ + "min": self.min, + "max": self.max, + "step": self.step, + "control_after_generate": self.control_after_generate, + "display": self.display_mode.value if self.display_mode else None, + }) + +@comfytype(io_type="FLOAT") +class Float(ComfyTypeIO): + Type = float + + class Input(WidgetInput): + '''Float input.''' + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + default: float=None, min: float=None, max: float=None, step: float=None, round: float=None, + display_mode: NumberDisplay=None, socketless: bool=None, force_input: bool=None): + super().__init__(id, display_name, optional, tooltip, lazy, default, socketless, None, force_input) + self.min = min + self.max = max + self.step = step + self.round = round + self.display_mode = display_mode + self.default: float + + def as_dict(self): + return super().as_dict() | prune_dict({ + "min": self.min, + "max": self.max, + "step": self.step, + "round": self.round, + "display": self.display_mode, + }) + +@comfytype(io_type="STRING") +class String(ComfyTypeIO): + Type = str + + class Input(WidgetInput): + '''String input.''' + def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + multiline=False, placeholder: str=None, default: str=None, dynamic_prompts: bool=None, + socketless: bool=None, force_input: bool=None): + super().__init__(id, display_name, optional, tooltip, lazy, default, socketless, None, force_input) + self.multiline = multiline + self.placeholder = placeholder + self.dynamic_prompts = dynamic_prompts + self.default: str + + def as_dict(self): + return super().as_dict() | prune_dict({ + "multiline": self.multiline, + "placeholder": self.placeholder, + "dynamicPrompts": self.dynamic_prompts, + }) + +@comfytype(io_type="COMBO") +class Combo(ComfyTypeI): + Type = str + class Input(WidgetInput): + """Combo input (dropdown).""" + Type = str + def __init__(self, id: str, options: list[str]=None, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + default: str=None, control_after_generate: bool=None, + upload: UploadType=None, image_folder: FolderType=None, + remote: RemoteOptions=None, + socketless: bool=None): + super().__init__(id, display_name, optional, tooltip, lazy, default, socketless) + self.multiselect = False + self.options = options + self.control_after_generate = control_after_generate + self.upload = upload + self.image_folder = image_folder + self.remote = remote + self.default: str + + def as_dict(self): + return super().as_dict() | prune_dict({ + "multiselect": self.multiselect, + "options": self.options, + "control_after_generate": self.control_after_generate, + **({self.upload.value: True} if self.upload is not None else {}), + "image_folder": self.image_folder.value if self.image_folder else None, + "remote": self.remote.as_dict() if self.remote else None, + }) + + +@comfytype(io_type="COMBO") +class MultiCombo(ComfyTypeI): + '''Multiselect Combo input (dropdown for selecting potentially more than one value).''' + # TODO: something is wrong with the serialization, frontend does not recognize it as multiselect + Type = list[str] + class Input(Combo.Input): + def __init__(self, id: str, options: list[str], display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, + default: list[str]=None, placeholder: str=None, chip: bool=None, control_after_generate: bool=None, + socketless: bool=None): + super().__init__(id, options, display_name, optional, tooltip, lazy, default, control_after_generate, socketless=socketless) + self.multiselect = True + self.placeholder = placeholder + self.chip = chip + self.default: list[str] + + def as_dict(self): + to_return = super().as_dict() | prune_dict({ + "multi_select": self.multiselect, + "placeholder": self.placeholder, + "chip": self.chip, + }) + return to_return + +@comfytype(io_type="IMAGE") +class Image(ComfyTypeIO): + Type = torch.Tensor + + +@comfytype(io_type="WAN_CAMERA_EMBEDDING") +class WanCameraEmbedding(ComfyTypeIO): + Type = torch.Tensor + + +@comfytype(io_type="WEBCAM") +class Webcam(ComfyTypeIO): + Type = str + + class Input(WidgetInput): + """Webcam input.""" + Type = str + def __init__( + self, id: str, display_name: str=None, optional=False, + tooltip: str=None, lazy: bool=None, default: str=None, socketless: bool=None + ): + super().__init__(id, display_name, optional, tooltip, lazy, default, socketless) + + +@comfytype(io_type="MASK") +class Mask(ComfyTypeIO): + Type = torch.Tensor + +@comfytype(io_type="LATENT") +class Latent(ComfyTypeIO): + '''Latents are stored as a dictionary.''' + class LatentDict(TypedDict): + samples: torch.Tensor + '''Latent tensors.''' + noise_mask: NotRequired[torch.Tensor] + batch_index: NotRequired[list[int]] + type: NotRequired[str] + '''Only needed if dealing with these types: audio, hunyuan3dv2''' + Type = LatentDict + +@comfytype(io_type="CONDITIONING") +class Conditioning(ComfyTypeIO): + class PooledDict(TypedDict): + pooled_output: torch.Tensor + '''Pooled output from CLIP.''' + control: NotRequired[ControlNet] + '''ControlNet to apply to conditioning.''' + control_apply_to_uncond: NotRequired[bool] + '''Whether to apply ControlNet to matching negative conditioning at sample time, if applicable.''' + cross_attn_controlnet: NotRequired[torch.Tensor] + '''CrossAttn from CLIP to use for controlnet only.''' + pooled_output_controlnet: NotRequired[torch.Tensor] + '''Pooled output from CLIP to use for controlnet only.''' + gligen: NotRequired[tuple[str, Gligen, list[tuple[torch.Tensor, int, ...]]]] + '''GLIGEN to apply to conditioning.''' + area: NotRequired[tuple[int, ...] | tuple[str, float, ...]] + '''Set area of conditioning. First half of values apply to dimensions, the second half apply to coordinates. + By default, the dimensions are based on total pixel amount, but the first value can be set to "percentage" to use a percentage of the image size instead. + + (1024, 1024, 0, 0) would apply conditioning to the top-left 1024x1024 pixels. + + ("percentage", 0.5, 0.5, 0, 0) would apply conditioning to the top-left 50% of the image.''' # TODO: verify its actually top-left + strength: NotRequired[float] + '''Strength of conditioning. Default strength is 1.0.''' + mask: NotRequired[torch.Tensor] + '''Mask to apply conditioning to.''' + mask_strength: NotRequired[float] + '''Strength of conditioning mask. Default strength is 1.0.''' + set_area_to_bounds: NotRequired[bool] + '''Whether conditioning mask should determine bounds of area - if set to false, latents are sampled at full resolution and result is applied in mask.''' + concat_latent_image: NotRequired[torch.Tensor] + '''Used for inpainting and specific models.''' + concat_mask: NotRequired[torch.Tensor] + '''Used for inpainting and specific models.''' + concat_image: NotRequired[torch.Tensor] + '''Used by SD_4XUpscale_Conditioning.''' + noise_augmentation: NotRequired[float] + '''Used by SD_4XUpscale_Conditioning.''' + hooks: NotRequired[HookGroup] + '''Applies hooks to conditioning.''' + default: NotRequired[bool] + '''Whether to this conditioning is 'default'; default conditioning gets applied to any areas of the image that have no masks/areas applied, assuming at least one area/mask is present during sampling.''' + start_percent: NotRequired[float] + '''Determines relative step to begin applying conditioning, expressed as a float between 0.0 and 1.0.''' + end_percent: NotRequired[float] + '''Determines relative step to end applying conditioning, expressed as a float between 0.0 and 1.0.''' + clip_start_percent: NotRequired[float] + '''Internal variable for conditioning scheduling - start of application, expressed as a float between 0.0 and 1.0.''' + clip_end_percent: NotRequired[float] + '''Internal variable for conditioning scheduling - end of application, expressed as a float between 0.0 and 1.0.''' + attention_mask: NotRequired[torch.Tensor] + '''Masks text conditioning; used by StyleModel among others.''' + attention_mask_img_shape: NotRequired[tuple[int, ...]] + '''Masks text conditioning; used by StyleModel among others.''' + unclip_conditioning: NotRequired[list[dict]] + '''Used by unCLIP.''' + conditioning_lyrics: NotRequired[torch.Tensor] + '''Used by AceT5Model.''' + seconds_start: NotRequired[float] + '''Used by StableAudio.''' + seconds_total: NotRequired[float] + '''Used by StableAudio.''' + lyrics_strength: NotRequired[float] + '''Used by AceStepAudio.''' + width: NotRequired[int] + '''Used by certain models (e.g. CLIPTextEncodeSDXL/Refiner, PixArtAlpha).''' + height: NotRequired[int] + '''Used by certain models (e.g. CLIPTextEncodeSDXL/Refiner, PixArtAlpha).''' + aesthetic_score: NotRequired[float] + '''Used by CLIPTextEncodeSDXL/Refiner.''' + crop_w: NotRequired[int] + '''Used by CLIPTextEncodeSDXL.''' + crop_h: NotRequired[int] + '''Used by CLIPTextEncodeSDXL.''' + target_width: NotRequired[int] + '''Used by CLIPTextEncodeSDXL.''' + target_height: NotRequired[int] + '''Used by CLIPTextEncodeSDXL.''' + reference_latents: NotRequired[list[torch.Tensor]] + '''Used by ReferenceLatent.''' + guidance: NotRequired[float] + '''Used by Flux-like models with guidance embed.''' + guiding_frame_index: NotRequired[int] + '''Used by Hunyuan ImageToVideo.''' + ref_latent: NotRequired[torch.Tensor] + '''Used by Hunyuan ImageToVideo.''' + keyframe_idxs: NotRequired[list[int]] + '''Used by LTXV.''' + frame_rate: NotRequired[float] + '''Used by LTXV.''' + stable_cascade_prior: NotRequired[torch.Tensor] + '''Used by StableCascade.''' + elevation: NotRequired[list[float]] + '''Used by SV3D.''' + azimuth: NotRequired[list[float]] + '''Used by SV3D.''' + motion_bucket_id: NotRequired[int] + '''Used by SVD-like models.''' + fps: NotRequired[int] + '''Used by SVD-like models.''' + augmentation_level: NotRequired[float] + '''Used by SVD-like models.''' + clip_vision_output: NotRequired[ClipVisionOutput_] + '''Used by WAN-like models.''' + vace_frames: NotRequired[torch.Tensor] + '''Used by WAN VACE.''' + vace_mask: NotRequired[torch.Tensor] + '''Used by WAN VACE.''' + vace_strength: NotRequired[float] + '''Used by WAN VACE.''' + camera_conditions: NotRequired[Any] # TODO: assign proper type once defined + '''Used by WAN Camera.''' + time_dim_concat: NotRequired[torch.Tensor] + '''Used by WAN Phantom Subject.''' + + CondList = list[tuple[torch.Tensor, PooledDict]] + Type = CondList + +@comfytype(io_type="SAMPLER") +class Sampler(ComfyTypeIO): + if TYPE_CHECKING: + Type = Sampler + +@comfytype(io_type="SIGMAS") +class Sigmas(ComfyTypeIO): + Type = torch.Tensor + +@comfytype(io_type="NOISE") +class Noise(ComfyTypeIO): + Type = torch.Tensor + +@comfytype(io_type="GUIDER") +class Guider(ComfyTypeIO): + if TYPE_CHECKING: + Type = CFGGuider + +@comfytype(io_type="CLIP") +class Clip(ComfyTypeIO): + if TYPE_CHECKING: + Type = CLIP + +@comfytype(io_type="CONTROL_NET") +class ControlNet(ComfyTypeIO): + if TYPE_CHECKING: + Type = ControlNet + +@comfytype(io_type="VAE") +class Vae(ComfyTypeIO): + if TYPE_CHECKING: + Type = VAE + +@comfytype(io_type="MODEL") +class Model(ComfyTypeIO): + if TYPE_CHECKING: + Type = ModelPatcher + +@comfytype(io_type="CLIP_VISION") +class ClipVision(ComfyTypeIO): + if TYPE_CHECKING: + Type = ClipVisionModel + +@comfytype(io_type="CLIP_VISION_OUTPUT") +class ClipVisionOutput(ComfyTypeIO): + if TYPE_CHECKING: + Type = ClipVisionOutput_ + +@comfytype(io_type="STYLE_MODEL") +class StyleModel(ComfyTypeIO): + if TYPE_CHECKING: + Type = StyleModel_ + +@comfytype(io_type="GLIGEN") +class Gligen(ComfyTypeIO): + '''ModelPatcher that wraps around a 'Gligen' model.''' + if TYPE_CHECKING: + Type = ModelPatcher + +@comfytype(io_type="UPSCALE_MODEL") +class UpscaleModel(ComfyTypeIO): + if TYPE_CHECKING: + Type = ImageModelDescriptor + +@comfytype(io_type="AUDIO") +class Audio(ComfyTypeIO): + class AudioDict(TypedDict): + waveform: torch.Tensor + sampler_rate: int + Type = AudioDict + +@comfytype(io_type="VIDEO") +class Video(ComfyTypeIO): + if TYPE_CHECKING: + Type = VideoInput + +@comfytype(io_type="SVG") +class SVG(ComfyTypeIO): + Type = Any # TODO: SVG class is defined in comfy_extras/nodes_images.py, causing circular reference; should be moved to somewhere else before referenced directly in v3 + +@comfytype(io_type="LORA_MODEL") +class LoraModel(ComfyTypeIO): + Type = dict[str, torch.Tensor] + +@comfytype(io_type="LOSS_MAP") +class LossMap(ComfyTypeIO): + class LossMapDict(TypedDict): + loss: list[torch.Tensor] + Type = LossMapDict + +@comfytype(io_type="VOXEL") +class Voxel(ComfyTypeIO): + Type = Any # TODO: VOXEL class is defined in comfy_extras/nodes_hunyuan3d.py; should be moved to somewhere else before referenced directly in v3 + +@comfytype(io_type="MESH") +class Mesh(ComfyTypeIO): + Type = Any # TODO: MESH class is defined in comfy_extras/nodes_hunyuan3d.py; should be moved to somewhere else before referenced directly in v3 + +@comfytype(io_type="HOOKS") +class Hooks(ComfyTypeIO): + if TYPE_CHECKING: + Type = HookGroup + +@comfytype(io_type="HOOK_KEYFRAMES") +class HookKeyframes(ComfyTypeIO): + if TYPE_CHECKING: + Type = HookKeyframeGroup + +@comfytype(io_type="TIMESTEPS_RANGE") +class TimestepsRange(ComfyTypeIO): + '''Range defined by start and endpoint, between 0.0 and 1.0.''' + Type = tuple[int, int] + +@comfytype(io_type="LATENT_OPERATION") +class LatentOperation(ComfyTypeIO): + Type = Callable[[torch.Tensor], torch.Tensor] + +@comfytype(io_type="FLOW_CONTROL") +class FlowControl(ComfyTypeIO): + # NOTE: only used in testing_nodes right now + Type = tuple[str, Any] + +@comfytype(io_type="ACCUMULATION") +class Accumulation(ComfyTypeIO): + # NOTE: only used in testing_nodes right now + class AccumulationDict(TypedDict): + accum: list[Any] + Type = AccumulationDict + + +@comfytype(io_type="LOAD3D_CAMERA") +class Load3DCamera(ComfyTypeIO): + class CameraInfo(TypedDict): + position: dict[str, float | int] + target: dict[str, float | int] + zoom: int + cameraType: str + + Type = CameraInfo + + +@comfytype(io_type="LOAD_3D") +class Load3D(ComfyTypeIO): + """3D models are stored as a dictionary.""" + class Model3DDict(TypedDict): + image: str + mask: str + normal: str + camera_info: Load3DCamera.CameraInfo + recording: NotRequired[str] + + Type = Model3DDict + + +@comfytype(io_type="LOAD_3D_ANIMATION") +class Load3DAnimation(Load3D): + ... + + +@comfytype(io_type="PHOTOMAKER") +class Photomaker(ComfyTypeIO): + Type = Any + + +@comfytype(io_type="POINT") +class Point(ComfyTypeIO): + Type = Any # NOTE: I couldn't find any references in core code to POINT io_type. Does this exist? + +@comfytype(io_type="FACE_ANALYSIS") +class FaceAnalysis(ComfyTypeIO): + Type = Any # NOTE: I couldn't find any references in core code to POINT io_type. Does this exist? + +@comfytype(io_type="BBOX") +class BBOX(ComfyTypeIO): + Type = Any # NOTE: I couldn't find any references in core code to POINT io_type. Does this exist? + +@comfytype(io_type="SEGS") +class SEGS(ComfyTypeIO): + Type = Any # NOTE: I couldn't find any references in core code to POINT io_type. Does this exist? + +@comfytype(io_type="*") +class AnyType(ComfyTypeIO): + Type = Any + +@comfytype(io_type="COMFY_MULTITYPED_V3") +class MultiType: + Type = Any + class Input(Input): + ''' + Input that permits more than one input type; if `id` is an instance of `ComfyType.Input`, then that input will be used to create a widget (if applicable) with overridden values. + ''' + def __init__(self, id: str | Input, types: list[type[_ComfyType] | _ComfyType], display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None): + # if id is an Input, then use that Input with overridden values + self.input_override = None + if isinstance(id, Input): + self.input_override = copy.copy(id) + optional = id.optional if id.optional is True else optional + tooltip = id.tooltip if id.tooltip is not None else tooltip + display_name = id.display_name if id.display_name is not None else display_name + lazy = id.lazy if id.lazy is not None else lazy + id = id.id + # if is a widget input, make sure widget_type is set appropriately + if isinstance(self.input_override, WidgetInput): + self.input_override.widget_type = self.input_override.get_io_type() + super().__init__(id, display_name, optional, tooltip, lazy, extra_dict) + self._io_types = types + + @property + def io_types(self) -> list[type[Input]]: + ''' + Returns list of Input class types permitted. + ''' + io_types = [] + for x in self._io_types: + if not is_class(x): + io_types.append(type(x)) + else: + io_types.append(x) + return io_types + + def get_io_type(self): + # ensure types are unique and order is preserved + str_types = [x.io_type for x in self.io_types] + if self.input_override is not None: + str_types.insert(0, self.input_override.get_io_type()) + return ",".join(list(dict.fromkeys(str_types))) + + def as_dict(self): + if self.input_override is not None: + return self.input_override.as_dict() | super().as_dict() + else: + return super().as_dict() + +class DynamicInput(Input, ABC): + ''' + Abstract class for dynamic input registration. + ''' + @abstractmethod + def get_dynamic(self) -> list[Input]: + ... + +class DynamicOutput(Output, ABC): + ''' + Abstract class for dynamic output registration. + ''' + def __init__(self, id: str=None, display_name: str=None, tooltip: str=None, + is_output_list=False): + super().__init__(id, display_name, tooltip, is_output_list) + + @abstractmethod + def get_dynamic(self) -> list[Output]: + ... + + +@comfytype(io_type="COMFY_AUTOGROW_V3") +class AutogrowDynamic(ComfyTypeI): + Type = list[Any] + class Input(DynamicInput): + def __init__(self, id: str, template_input: Input, min: int=1, max: int=None, + display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None): + super().__init__(id, display_name, optional, tooltip, lazy, extra_dict) + self.template_input = template_input + if min is not None: + assert(min >= 1) + if max is not None: + assert(max >= 1) + self.min = min + self.max = max + + def get_dynamic(self) -> list[Input]: + curr_count = 1 + new_inputs = [] + for i in range(self.min): + new_input = copy.copy(self.template_input) + new_input.id = f"{new_input.id}{curr_count}_${self.id}_ag$" + if new_input.display_name is not None: + new_input.display_name = f"{new_input.display_name}{curr_count}" + new_input.optional = self.optional or new_input.optional + if isinstance(self.template_input, WidgetInput): + new_input.force_input = True + new_inputs.append(new_input) + curr_count += 1 + # pretend to expand up to max + for i in range(curr_count-1, self.max): + new_input = copy.copy(self.template_input) + new_input.id = f"{new_input.id}{curr_count}_${self.id}_ag$" + if new_input.display_name is not None: + new_input.display_name = f"{new_input.display_name}{curr_count}" + new_input.optional = True + if isinstance(self.template_input, WidgetInput): + new_input.force_input = True + new_inputs.append(new_input) + curr_count += 1 + return new_inputs + +@comfytype(io_type="COMFY_COMBODYNAMIC_V3") +class ComboDynamic(ComfyTypeI): + class Input(DynamicInput): + def __init__(self, id: str): + pass + +@comfytype(io_type="COMFY_MATCHTYPE_V3") +class MatchType(ComfyTypeIO): + class Template: + def __init__(self, template_id: str, allowed_types: _ComfyType | list[_ComfyType]): + self.template_id = template_id + self.allowed_types = [allowed_types] if isinstance(allowed_types, _ComfyType) else allowed_types + + def as_dict(self): + return { + "template_id": self.template_id, + "allowed_types": "".join(t.io_type for t in self.allowed_types), + } + + class Input(DynamicInput): + def __init__(self, id: str, template: MatchType.Template, + display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None): + super().__init__(id, display_name, optional, tooltip, lazy, extra_dict) + self.template = template + + def get_dynamic(self) -> list[Input]: + return [self] + + def as_dict(self): + return super().as_dict() | prune_dict({ + "template": self.template.as_dict(), + }) + + class Output(DynamicOutput): + def __init__(self, id: str, template: MatchType.Template, display_name: str=None, tooltip: str=None, + is_output_list=False): + super().__init__(id, display_name, tooltip, is_output_list) + self.template = template + + def get_dynamic(self) -> list[Output]: + return [self] + + def as_dict(self): + return super().as_dict() | prune_dict({ + "template": self.template.as_dict(), + }) + + +class HiddenHolder: + def __init__(self, unique_id: str, prompt: Any, + extra_pnginfo: Any, dynprompt: Any, + auth_token_comfy_org: str, api_key_comfy_org: str, **kwargs): + self.unique_id = unique_id + """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages).""" + self.prompt = prompt + """PROMPT is the complete prompt sent by the client to the server. See the prompt object for a full description.""" + self.extra_pnginfo = extra_pnginfo + """EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node).""" + self.dynprompt = dynprompt + """DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion.""" + self.auth_token_comfy_org = auth_token_comfy_org + """AUTH_TOKEN_COMFY_ORG is a token acquired from signing into a ComfyOrg account on frontend.""" + self.api_key_comfy_org = api_key_comfy_org + """API_KEY_COMFY_ORG is an API Key generated by ComfyOrg that allows skipping signing into a ComfyOrg account on frontend.""" + + def __getattr__(self, key: str): + '''If hidden variable not found, return None.''' + return None + + @classmethod + def from_dict(cls, d: dict | None): + if d is None: + d = {} + return cls( + unique_id=d.get(Hidden.unique_id, None), + prompt=d.get(Hidden.prompt, None), + extra_pnginfo=d.get(Hidden.extra_pnginfo, None), + dynprompt=d.get(Hidden.dynprompt, None), + auth_token_comfy_org=d.get(Hidden.auth_token_comfy_org, None), + api_key_comfy_org=d.get(Hidden.api_key_comfy_org, None), + ) + +class Hidden(str, Enum): + ''' + Enumerator for requesting hidden variables in nodes. + ''' + unique_id = "UNIQUE_ID" + """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages).""" + prompt = "PROMPT" + """PROMPT is the complete prompt sent by the client to the server. See the prompt object for a full description.""" + extra_pnginfo = "EXTRA_PNGINFO" + """EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node).""" + dynprompt = "DYNPROMPT" + """DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion.""" + auth_token_comfy_org = "AUTH_TOKEN_COMFY_ORG" + """AUTH_TOKEN_COMFY_ORG is a token acquired from signing into a ComfyOrg account on frontend.""" + api_key_comfy_org = "API_KEY_COMFY_ORG" + """API_KEY_COMFY_ORG is an API Key generated by ComfyOrg that allows skipping signing into a ComfyOrg account on frontend.""" + + +@dataclass +class NodeInfoV1: + input: dict=None + input_order: dict[str, list[str]]=None + output: list[str]=None + output_is_list: list[bool]=None + output_name: list[str]=None + output_tooltips: list[str]=None + name: str=None + display_name: str=None + description: str=None + python_module: Any=None + category: str=None + output_node: bool=None + deprecated: bool=None + experimental: bool=None + api_node: bool=None + +@dataclass +class NodeInfoV3: + input: dict=None + output: dict=None + hidden: list[str]=None + name: str=None + display_name: str=None + description: str=None + category: str=None + output_node: bool=None + deprecated: bool=None + experimental: bool=None + api_node: bool=None + + +@dataclass +class Schema: + """Definition of V3 node properties.""" + + node_id: str + """ID of node - should be globally unique. If this is a custom node, add a prefix or postfix to avoid name clashes.""" + display_name: str = None + """Display name of node.""" + category: str = "sd" + """The category of the node, as per the "Add Node" menu.""" + inputs: list[Input]=None + outputs: list[Output]=None + hidden: list[Hidden]=None + description: str="" + """Node description, shown as a tooltip when hovering over the node.""" + is_input_list: bool = False + """A flag indicating if this node implements the additional code necessary to deal with OUTPUT_IS_LIST nodes. + + All inputs of ``type`` will become ``list[type]``, regardless of how many items are passed in. This also affects ``check_lazy_status``. + + From the docs: + + A node can also override the default input behaviour and receive the whole list in a single call. This is done by setting a class attribute `INPUT_IS_LIST` to ``True``. + + Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing + """ + is_output_node: bool=False + """Flags this node as an output node, causing any inputs it requires to be executed. + + If a node is not connected to any output nodes, that node will not be executed. Usage:: + + From the docs: + + By default, a node is not considered an output. Set ``OUTPUT_NODE = True`` to specify that it is. + + Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#output-node + """ + is_deprecated: bool=False + """Flags a node as deprecated, indicating to users that they should find alternatives to this node.""" + is_experimental: bool=False + """Flags a node as experimental, informing users that it may change or not work as expected.""" + is_api_node: bool=False + """Flags a node as an API node. See: https://docs.comfy.org/tutorials/api-nodes/overview.""" + not_idempotent: bool=False + """Flags a node as not idempotent; when True, the node will run and not reuse the cached outputs when identical inputs are provided on a different node in the graph.""" + enable_expand: bool=False + """Flags a node as expandable, allowing NodeOutput to include 'expand' property.""" + + def validate(self): + '''Validate the schema: + - verify ids on inputs and outputs are unique - both internally and in relation to each other + ''' + input_ids = [i.id for i in self.inputs] if self.inputs is not None else [] + output_ids = [o.id for o in self.outputs] if self.outputs is not None else [] + input_set = set(input_ids) + output_set = set(output_ids) + issues = [] + # verify ids are unique per list + if len(input_set) != len(input_ids): + issues.append(f"Input ids must be unique, but {[item for item, count in Counter(input_ids).items() if count > 1]} are not.") + if len(output_set) != len(output_ids): + issues.append(f"Output ids must be unique, but {[item for item, count in Counter(output_ids).items() if count > 1]} are not.") + # verify ids are unique between lists + intersection = input_set & output_set + if len(intersection) > 0: + issues.append(f"Ids must be unique between inputs and outputs, but {intersection} are not.") + if len(issues) > 0: + raise ValueError("\n".join(issues)) + + def finalize(self): + """Add hidden based on selected schema options, and give outputs without ids default ids.""" + # if is an api_node, will need key-related hidden + if self.is_api_node: + if self.hidden is None: + self.hidden = [] + if Hidden.auth_token_comfy_org not in self.hidden: + self.hidden.append(Hidden.auth_token_comfy_org) + if Hidden.api_key_comfy_org not in self.hidden: + self.hidden.append(Hidden.api_key_comfy_org) + # if is an output_node, will need prompt and extra_pnginfo + if self.is_output_node: + if self.hidden is None: + self.hidden = [] + if Hidden.prompt not in self.hidden: + self.hidden.append(Hidden.prompt) + if Hidden.extra_pnginfo not in self.hidden: + self.hidden.append(Hidden.extra_pnginfo) + # give outputs without ids default ids + if self.outputs is not None: + for i, output in enumerate(self.outputs): + if output.id is None: + output.id = f"_{i}_{output.io_type}_" + + def get_v1_info(self, cls) -> NodeInfoV1: + # get V1 inputs + input = { + "required": {} + } + if self.inputs: + for i in self.inputs: + if isinstance(i, DynamicInput): + dynamic_inputs = i.get_dynamic() + for d in dynamic_inputs: + add_to_dict_v1(d, input) + else: + add_to_dict_v1(i, input) + if self.hidden: + for hidden in self.hidden: + input.setdefault("hidden", {})[hidden.name] = (hidden.value,) + # create separate lists from output fields + output = [] + output_is_list = [] + output_name = [] + output_tooltips = [] + if self.outputs: + for o in self.outputs: + output.append(o.io_type) + output_is_list.append(o.is_output_list) + output_name.append(o.display_name if o.display_name else o.io_type) + output_tooltips.append(o.tooltip if o.tooltip else None) + + info = NodeInfoV1( + input=input, + input_order={key: list(value.keys()) for (key, value) in input.items()}, + output=output, + output_is_list=output_is_list, + output_name=output_name, + output_tooltips=output_tooltips, + name=self.node_id, + display_name=self.display_name, + category=self.category, + description=self.description, + output_node=self.is_output_node, + deprecated=self.is_deprecated, + experimental=self.is_experimental, + api_node=self.is_api_node, + python_module=getattr(cls, "RELATIVE_PYTHON_MODULE", "nodes") + ) + return info + + + def get_v3_info(self, cls) -> NodeInfoV3: + input_dict = {} + output_dict = {} + hidden_list = [] + # TODO: make sure dynamic types will be handled correctly + if self.inputs: + for input in self.inputs: + add_to_dict_v3(input, input_dict) + if self.outputs: + for output in self.outputs: + add_to_dict_v3(output, output_dict) + if self.hidden: + for hidden in self.hidden: + hidden_list.append(hidden.value) + + info = NodeInfoV3( + input=input_dict, + output=output_dict, + hidden=hidden_list, + name=self.node_id, + display_name=self.display_name, + description=self.description, + category=self.category, + output_node=self.is_output_node, + deprecated=self.is_deprecated, + experimental=self.is_experimental, + api_node=self.is_api_node, + python_module=getattr(cls, "RELATIVE_PYTHON_MODULE", "nodes") + ) + return info + + +def add_to_dict_v1(i: Input, input: dict): + key = "optional" if i.optional else "required" + as_dict = i.as_dict() + # for v1, we don't want to include the optional key + as_dict.pop("optional", None) + input.setdefault(key, {})[i.id] = (i.get_io_type(), as_dict) + +def add_to_dict_v3(io: Input | Output, d: dict): + d[io.id] = (io.get_io_type(), io.as_dict()) + + + +class _ComfyNodeBaseInternal(_ComfyNodeInternal): + """Common base class for storing internal methods and properties; DO NOT USE for defining nodes.""" + + RELATIVE_PYTHON_MODULE = None + SCHEMA = None + + # filled in during execution + resources: Resources = None + hidden: HiddenHolder = None + + @classmethod + @abstractmethod + def define_schema(cls) -> Schema: + """Override this function with one that returns a Schema instance.""" + raise NotImplementedError + + @classmethod + @abstractmethod + def execute(cls, **kwargs) -> NodeOutput: + """Override this function with one that performs node's actions.""" + raise NotImplementedError + + @classmethod + def validate_inputs(cls, **kwargs) -> bool: + """Optionally, define this function to validate inputs; equivalent to V1's VALIDATE_INPUTS.""" + raise NotImplementedError + + @classmethod + def fingerprint_inputs(cls, **kwargs) -> Any: + """Optionally, define this function to fingerprint inputs; equivalent to V1's IS_CHANGED.""" + raise NotImplementedError + + @classmethod + def check_lazy_status(cls, **kwargs) -> list[str]: + """Optionally, define this function to return a list of input names that should be evaluated. + + This basic mixin impl. requires all inputs. + + :kwargs: All node inputs will be included here. If the input is ``None``, it should be assumed that it has not yet been evaluated. \ + When using ``INPUT_IS_LIST = True``, unevaluated will instead be ``(None,)``. + + Params should match the nodes execution ``FUNCTION`` (self, and all inputs by name). + Will be executed repeatedly until it returns an empty list, or all requested items were already evaluated (and sent as params). + + Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lazy_evaluation#defining-check-lazy-status + """ + return [name for name in kwargs if kwargs[name] is None] + + def __init__(self): + self.local_resources: ResourcesLocal = None + self.__class__.VALIDATE_CLASS() + + @classmethod + def GET_BASE_CLASS(cls): + return _ComfyNodeBaseInternal + + @final + @classmethod + def VALIDATE_CLASS(cls): + if first_real_override(cls, "define_schema") is None: + raise Exception(f"No define_schema function was defined for node class {cls.__name__}.") + if first_real_override(cls, "execute") is None: + raise Exception(f"No execute function was defined for node class {cls.__name__}.") + + @classproperty + def FUNCTION(cls): # noqa + if inspect.iscoroutinefunction(cls.execute): + return "EXECUTE_NORMALIZED_ASYNC" + return "EXECUTE_NORMALIZED" + + @final + @classmethod + def EXECUTE_NORMALIZED(cls, *args, **kwargs) -> NodeOutput: + to_return = cls.execute(*args, **kwargs) + if to_return is None: + to_return = NodeOutput() + elif isinstance(to_return, NodeOutput): + pass + elif isinstance(to_return, tuple): + to_return = NodeOutput(*to_return) + elif isinstance(to_return, dict): + to_return = NodeOutput.from_dict(to_return) + elif isinstance(to_return, ExecutionBlocker): + to_return = NodeOutput(block_execution=to_return.message) + else: + raise Exception(f"Invalid return type from node: {type(to_return)}") + if to_return.expand is not None and not cls.SCHEMA.enable_expand: + raise Exception(f"Node {cls.__name__} is not expandable, but expand included in NodeOutput; developer should set enable_expand=True on node's Schema to allow this.") + return to_return + + @final + @classmethod + async def EXECUTE_NORMALIZED_ASYNC(cls, *args, **kwargs) -> NodeOutput: + to_return = await cls.execute(*args, **kwargs) + if to_return is None: + to_return = NodeOutput() + elif isinstance(to_return, NodeOutput): + pass + elif isinstance(to_return, tuple): + to_return = NodeOutput(*to_return) + elif isinstance(to_return, dict): + to_return = NodeOutput.from_dict(to_return) + elif isinstance(to_return, ExecutionBlocker): + to_return = NodeOutput(block_execution=to_return.message) + else: + raise Exception(f"Invalid return type from node: {type(to_return)}") + if to_return.expand is not None and not cls.SCHEMA.enable_expand: + raise Exception(f"Node {cls.__name__} is not expandable, but expand included in NodeOutput; developer should set enable_expand=True on node's Schema to allow this.") + return to_return + + @final + @classmethod + def PREPARE_CLASS_CLONE(cls, hidden_inputs: dict) -> type[ComfyNode]: + """Creates clone of real node class to prevent monkey-patching.""" + c_type: type[ComfyNode] = cls if is_class(cls) else type(cls) + type_clone: type[ComfyNode] = shallow_clone_class(c_type) + # set hidden + type_clone.hidden = HiddenHolder.from_dict(hidden_inputs) + return type_clone + + @final + @classmethod + def GET_NODE_INFO_V3(cls) -> dict[str, Any]: + schema = cls.GET_SCHEMA() + info = schema.get_v3_info(cls) + return asdict(info) + ############################################# + # V1 Backwards Compatibility code + #-------------------------------------------- + @final + @classmethod + def GET_NODE_INFO_V1(cls) -> dict[str, Any]: + schema = cls.GET_SCHEMA() + info = schema.get_v1_info(cls) + return asdict(info) + + _DESCRIPTION = None + @final + @classproperty + def DESCRIPTION(cls): # noqa + if cls._DESCRIPTION is None: + cls.GET_SCHEMA() + return cls._DESCRIPTION + + _CATEGORY = None + @final + @classproperty + def CATEGORY(cls): # noqa + if cls._CATEGORY is None: + cls.GET_SCHEMA() + return cls._CATEGORY + + _EXPERIMENTAL = None + @final + @classproperty + def EXPERIMENTAL(cls): # noqa + if cls._EXPERIMENTAL is None: + cls.GET_SCHEMA() + return cls._EXPERIMENTAL + + _DEPRECATED = None + @final + @classproperty + def DEPRECATED(cls): # noqa + if cls._DEPRECATED is None: + cls.GET_SCHEMA() + return cls._DEPRECATED + + _API_NODE = None + @final + @classproperty + def API_NODE(cls): # noqa + if cls._API_NODE is None: + cls.GET_SCHEMA() + return cls._API_NODE + + _OUTPUT_NODE = None + @final + @classproperty + def OUTPUT_NODE(cls): # noqa + if cls._OUTPUT_NODE is None: + cls.GET_SCHEMA() + return cls._OUTPUT_NODE + + _INPUT_IS_LIST = None + @final + @classproperty + def INPUT_IS_LIST(cls): # noqa + if cls._INPUT_IS_LIST is None: + cls.GET_SCHEMA() + return cls._INPUT_IS_LIST + _OUTPUT_IS_LIST = None + + @final + @classproperty + def OUTPUT_IS_LIST(cls): # noqa + if cls._OUTPUT_IS_LIST is None: + cls.GET_SCHEMA() + return cls._OUTPUT_IS_LIST + + _RETURN_TYPES = None + @final + @classproperty + def RETURN_TYPES(cls): # noqa + if cls._RETURN_TYPES is None: + cls.GET_SCHEMA() + return cls._RETURN_TYPES + + _RETURN_NAMES = None + @final + @classproperty + def RETURN_NAMES(cls): # noqa + if cls._RETURN_NAMES is None: + cls.GET_SCHEMA() + return cls._RETURN_NAMES + + _OUTPUT_TOOLTIPS = None + @final + @classproperty + def OUTPUT_TOOLTIPS(cls): # noqa + if cls._OUTPUT_TOOLTIPS is None: + cls.GET_SCHEMA() + return cls._OUTPUT_TOOLTIPS + + _NOT_IDEMPOTENT = None + @final + @classproperty + def NOT_IDEMPOTENT(cls): # noqa + if cls._NOT_IDEMPOTENT is None: + cls.GET_SCHEMA() + return cls._NOT_IDEMPOTENT + + @final + @classmethod + def INPUT_TYPES(cls, include_hidden=True, return_schema=False) -> dict[str, dict] | tuple[dict[str, dict], Schema]: + schema = cls.FINALIZE_SCHEMA() + info = schema.get_v1_info(cls) + input = info.input + if not include_hidden: + input.pop("hidden", None) + if return_schema: + return input, schema + return input + + @final + @classmethod + def FINALIZE_SCHEMA(cls): + """Call define_schema and finalize it.""" + schema = cls.define_schema() + schema.finalize() + return schema + + @final + @classmethod + def GET_SCHEMA(cls) -> Schema: + """Validate node class, finalize schema, validate schema, and set expected class properties.""" + cls.VALIDATE_CLASS() + schema = cls.FINALIZE_SCHEMA() + schema.validate() + if cls._DESCRIPTION is None: + cls._DESCRIPTION = schema.description + if cls._CATEGORY is None: + cls._CATEGORY = schema.category + if cls._EXPERIMENTAL is None: + cls._EXPERIMENTAL = schema.is_experimental + if cls._DEPRECATED is None: + cls._DEPRECATED = schema.is_deprecated + if cls._API_NODE is None: + cls._API_NODE = schema.is_api_node + if cls._OUTPUT_NODE is None: + cls._OUTPUT_NODE = schema.is_output_node + if cls._INPUT_IS_LIST is None: + cls._INPUT_IS_LIST = schema.is_input_list + if cls._NOT_IDEMPOTENT is None: + cls._NOT_IDEMPOTENT = schema.not_idempotent + + if cls._RETURN_TYPES is None: + output = [] + output_name = [] + output_is_list = [] + output_tooltips = [] + if schema.outputs: + for o in schema.outputs: + output.append(o.io_type) + output_name.append(o.display_name if o.display_name else o.io_type) + output_is_list.append(o.is_output_list) + output_tooltips.append(o.tooltip if o.tooltip else None) + + cls._RETURN_TYPES = output + cls._RETURN_NAMES = output_name + cls._OUTPUT_IS_LIST = output_is_list + cls._OUTPUT_TOOLTIPS = output_tooltips + cls.SCHEMA = schema + return schema + #-------------------------------------------- + ############################################# + + +class ComfyNode(_ComfyNodeBaseInternal): + """Common base class for all V3 nodes.""" + + @classmethod + @abstractmethod + def define_schema(cls) -> Schema: + """Override this function with one that returns a Schema instance.""" + raise NotImplementedError + + @classmethod + @abstractmethod + def execute(cls, **kwargs) -> NodeOutput: + """Override this function with one that performs node's actions.""" + raise NotImplementedError + + @classmethod + def validate_inputs(cls, **kwargs) -> bool: + """Optionally, define this function to validate inputs; equivalent to V1's VALIDATE_INPUTS.""" + raise NotImplementedError + + @classmethod + def fingerprint_inputs(cls, **kwargs) -> Any: + """Optionally, define this function to fingerprint inputs; equivalent to V1's IS_CHANGED.""" + raise NotImplementedError + + @classmethod + def check_lazy_status(cls, **kwargs) -> list[str]: + """Optionally, define this function to return a list of input names that should be evaluated. + + This basic mixin impl. requires all inputs. + + :kwargs: All node inputs will be included here. If the input is ``None``, it should be assumed that it has not yet been evaluated. \ + When using ``INPUT_IS_LIST = True``, unevaluated will instead be ``(None,)``. + + Params should match the nodes execution ``FUNCTION`` (self, and all inputs by name). + Will be executed repeatedly until it returns an empty list, or all requested items were already evaluated (and sent as params). + + Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lazy_evaluation#defining-check-lazy-status + """ + return [name for name in kwargs if kwargs[name] is None] + + @final + @classmethod + def GET_BASE_CLASS(cls): + """DO NOT override this class. Will break things in execution.py.""" + return ComfyNode + + +class NodeOutput(_NodeOutputInternal): + ''' + Standardized output of a node; can pass in any number of args and/or a UIOutput into 'ui' kwarg. + ''' + def __init__(self, *args: Any, ui: _UIOutput | dict=None, expand: dict=None, block_execution: str=None): + self.args = args + self.ui = ui + self.expand = expand + self.block_execution = block_execution + + @property + def result(self): + return self.args if len(self.args) > 0 else None + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "NodeOutput": + args = () + ui = None + expand = None + if "result" in data: + result = data["result"] + if isinstance(result, ExecutionBlocker): + return cls(block_execution=result.message) + args = result + if "ui" in data: + ui = data["ui"] + if "expand" in data: + expand = data["expand"] + return cls(args=args, ui=ui, expand=expand) + + def __getitem__(self, index) -> Any: + return self.args[index] + +class _UIOutput(ABC): + def __init__(self): + pass + + @abstractmethod + def as_dict(self) -> dict: + ... + + +class _IO: + FolderType = FolderType + UploadType = UploadType + RemoteOptions = RemoteOptions + NumberDisplay = NumberDisplay + + comfytype = staticmethod(comfytype) + Custom = staticmethod(Custom) + Input = Input + WidgetInput = WidgetInput + Output = Output + ComfyTypeI = ComfyTypeI + ComfyTypeIO = ComfyTypeIO + #--------------------------------- + # Supported Types + Boolean = Boolean + Int = Int + Float = Float + String = String + Combo = Combo + MultiCombo = MultiCombo + Image = Image + WanCameraEmbedding = WanCameraEmbedding + Webcam = Webcam + Mask = Mask + Latent = Latent + Conditioning = Conditioning + Sampler = Sampler + Sigmas = Sigmas + Noise = Noise + Guider = Guider + Clip = Clip + ControlNet = ControlNet + Vae = Vae + Model = Model + ClipVision = ClipVision + ClipVisionOutput = ClipVisionOutput + StyleModel = StyleModel + Gligen = Gligen + UpscaleModel = UpscaleModel + Audio = Audio + Video = Video + SVG = SVG + LoraModel = LoraModel + LossMap = LossMap + Voxel = Voxel + Mesh = Mesh + Hooks = Hooks + HookKeyframes = HookKeyframes + TimestepsRange = TimestepsRange + LatentOperation = LatentOperation + FlowControl = FlowControl + Accumulation = Accumulation + Load3DCamera = Load3DCamera + Load3D = Load3D + Load3DAnimation = Load3DAnimation + Photomaker = Photomaker + Point = Point + FaceAnalysis = FaceAnalysis + BBOX = BBOX + SEGS = SEGS + AnyType = AnyType + MultiType = MultiType + #--------------------------------- + HiddenHolder = HiddenHolder + Hidden = Hidden + NodeInfoV1 = NodeInfoV1 + NodeInfoV3 = NodeInfoV3 + Schema = Schema + ComfyNode = ComfyNode + NodeOutput = NodeOutput + add_to_dict_v1 = staticmethod(add_to_dict_v1) + add_to_dict_v3 = staticmethod(add_to_dict_v3) diff --git a/comfy_api/latest/_resources.py b/comfy_api/latest/_resources.py new file mode 100644 index 000000000..a6bdda972 --- /dev/null +++ b/comfy_api/latest/_resources.py @@ -0,0 +1,72 @@ +from __future__ import annotations +import comfy.utils +import folder_paths +import logging +from abc import ABC, abstractmethod +from typing import Any +import torch + +class ResourceKey(ABC): + Type = Any + def __init__(self): + ... + +class TorchDictFolderFilename(ResourceKey): + '''Key for requesting a torch file via file_name from a folder category.''' + Type = dict[str, torch.Tensor] + def __init__(self, folder_name: str, file_name: str): + self.folder_name = folder_name + self.file_name = file_name + + def __hash__(self): + return hash((self.folder_name, self.file_name)) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, TorchDictFolderFilename): + return False + return self.folder_name == other.folder_name and self.file_name == other.file_name + + def __str__(self): + return f"{self.folder_name} -> {self.file_name}" + +class Resources(ABC): + def __init__(self): + ... + + @abstractmethod + def get(self, key: ResourceKey, default: Any=...) -> Any: + pass + +class ResourcesLocal(Resources): + def __init__(self): + super().__init__() + self.local_resources: dict[ResourceKey, Any] = {} + + def get(self, key: ResourceKey, default: Any=...) -> Any: + cached = self.local_resources.get(key, None) + if cached is not None: + logging.info(f"Using cached resource '{key}'") + return cached + logging.info(f"Loading resource '{key}'") + to_return = None + if isinstance(key, TorchDictFolderFilename): + if default is ...: + to_return = comfy.utils.load_torch_file(folder_paths.get_full_path_or_raise(key.folder_name, key.file_name), safe_load=True) + else: + full_path = folder_paths.get_full_path(key.folder_name, key.file_name) + if full_path is not None: + to_return = comfy.utils.load_torch_file(full_path, safe_load=True) + + if to_return is not None: + self.local_resources[key] = to_return + return to_return + if default is not ...: + return default + raise Exception(f"Unsupported resource key type: {type(key)}") + + +class _RESOURCES: + ResourceKey = ResourceKey + TorchDictFolderFilename = TorchDictFolderFilename + Resources = Resources + ResourcesLocal = ResourcesLocal diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py new file mode 100644 index 000000000..6b8a39d58 --- /dev/null +++ b/comfy_api/latest/_ui.py @@ -0,0 +1,457 @@ +from __future__ import annotations + +import json +import os +import random +from io import BytesIO +from typing import Type + +import av +import numpy as np +import torch +import torchaudio +from PIL import Image as PILImage +from PIL.PngImagePlugin import PngInfo + +import folder_paths + +# used for image preview +from comfy.cli_args import args +from comfy_api.latest._io import ComfyNode, FolderType, Image, _UIOutput + + +class SavedResult(dict): + def __init__(self, filename: str, subfolder: str, type: FolderType): + super().__init__(filename=filename, subfolder=subfolder,type=type.value) + + @property + def filename(self) -> str: + return self["filename"] + + @property + def subfolder(self) -> str: + return self["subfolder"] + + @property + def type(self) -> FolderType: + return FolderType(self["type"]) + + +class SavedImages(_UIOutput): + """A UI output class to represent one or more saved images, potentially animated.""" + def __init__(self, results: list[SavedResult], is_animated: bool = False): + super().__init__() + self.results = results + self.is_animated = is_animated + + def as_dict(self) -> dict: + data = {"images": self.results} + if self.is_animated: + data["animated"] = (True,) + return data + + +class SavedAudios(_UIOutput): + """UI wrapper around one or more audio files on disk (FLAC / MP3 / Opus).""" + def __init__(self, results: list[SavedResult]): + super().__init__() + self.results = results + + def as_dict(self) -> dict: + return {"audio": self.results} + + +def _get_directory_by_folder_type(folder_type: FolderType) -> str: + if folder_type == FolderType.input: + return folder_paths.get_input_directory() + if folder_type == FolderType.output: + return folder_paths.get_output_directory() + return folder_paths.get_temp_directory() + + +class ImageSaveHelper: + """A helper class with static methods to handle image saving and metadata.""" + + @staticmethod + def _convert_tensor_to_pil(image_tensor: torch.Tensor) -> PILImage.Image: + """Converts a single torch tensor to a PIL Image.""" + return PILImage.fromarray(np.clip(255.0 * image_tensor.cpu().numpy(), 0, 255).astype(np.uint8)) + + @staticmethod + def _create_png_metadata(cls: Type[ComfyNode] | None) -> PngInfo | None: + """Creates a PngInfo object with prompt and extra_pnginfo.""" + if args.disable_metadata or cls is None or not cls.hidden: + return None + metadata = PngInfo() + if cls.hidden.prompt: + metadata.add_text("prompt", json.dumps(cls.hidden.prompt)) + if cls.hidden.extra_pnginfo: + for x in cls.hidden.extra_pnginfo: + metadata.add_text(x, json.dumps(cls.hidden.extra_pnginfo[x])) + return metadata + + @staticmethod + def _create_animated_png_metadata(cls: Type[ComfyNode] | None) -> PngInfo | None: + """Creates a PngInfo object with prompt and extra_pnginfo for animated PNGs (APNG).""" + if args.disable_metadata or cls is None or not cls.hidden: + return None + metadata = PngInfo() + if cls.hidden.prompt: + metadata.add( + b"comf", + "prompt".encode("latin-1", "strict") + + b"\0" + + json.dumps(cls.hidden.prompt).encode("latin-1", "strict"), + after_idat=True, + ) + if cls.hidden.extra_pnginfo: + for x in cls.hidden.extra_pnginfo: + metadata.add( + b"comf", + x.encode("latin-1", "strict") + + b"\0" + + json.dumps(cls.hidden.extra_pnginfo[x]).encode("latin-1", "strict"), + after_idat=True, + ) + return metadata + + @staticmethod + def _create_webp_metadata(pil_image: PILImage.Image, cls: Type[ComfyNode] | None) -> PILImage.Exif: + """Creates EXIF metadata bytes for WebP images.""" + exif_data = pil_image.getexif() + if args.disable_metadata or cls is None or cls.hidden is None: + return exif_data + if cls.hidden.prompt is not None: + exif_data[0x0110] = "prompt:{}".format(json.dumps(cls.hidden.prompt)) # EXIF 0x0110 = Model + if cls.hidden.extra_pnginfo is not None: + inital_exif_tag = 0x010F # EXIF 0x010f = Make + for key, value in cls.hidden.extra_pnginfo.items(): + exif_data[inital_exif_tag] = "{}:{}".format(key, json.dumps(value)) + inital_exif_tag -= 1 + return exif_data + + @staticmethod + def save_images( + images, filename_prefix: str, folder_type: FolderType, cls: Type[ComfyNode] | None, compress_level = 4, + ) -> list[SavedResult]: + """Saves a batch of images as individual PNG files.""" + full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path( + filename_prefix, _get_directory_by_folder_type(folder_type), images[0].shape[1], images[0].shape[0] + ) + results = [] + metadata = ImageSaveHelper._create_png_metadata(cls) + for batch_number, image_tensor in enumerate(images): + img = ImageSaveHelper._convert_tensor_to_pil(image_tensor) + filename_with_batch_num = filename.replace("%batch_num%", str(batch_number)) + file = f"{filename_with_batch_num}_{counter:05}_.png" + img.save(os.path.join(full_output_folder, file), pnginfo=metadata, compress_level=compress_level) + results.append(SavedResult(file, subfolder, folder_type)) + counter += 1 + return results + + @staticmethod + def get_save_images_ui(images, filename_prefix: str, cls: Type[ComfyNode] | None, compress_level=4) -> SavedImages: + """Saves a batch of images and returns a UI object for the node output.""" + return SavedImages( + ImageSaveHelper.save_images( + images, + filename_prefix=filename_prefix, + folder_type=FolderType.output, + cls=cls, + compress_level=compress_level, + ) + ) + + @staticmethod + def save_animated_png( + images, filename_prefix: str, folder_type: FolderType, cls: Type[ComfyNode] | None, fps: float, compress_level: int + ) -> SavedResult: + """Saves a batch of images as a single animated PNG.""" + full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path( + filename_prefix, _get_directory_by_folder_type(folder_type), images[0].shape[1], images[0].shape[0] + ) + pil_images = [ImageSaveHelper._convert_tensor_to_pil(img) for img in images] + metadata = ImageSaveHelper._create_animated_png_metadata(cls) + file = f"{filename}_{counter:05}_.png" + save_path = os.path.join(full_output_folder, file) + pil_images[0].save( + save_path, + pnginfo=metadata, + compress_level=compress_level, + save_all=True, + duration=int(1000.0 / fps), + append_images=pil_images[1:], + ) + return SavedResult(file, subfolder, folder_type) + + @staticmethod + def get_save_animated_png_ui( + images, filename_prefix: str, cls: Type[ComfyNode] | None, fps: float, compress_level: int + ) -> SavedImages: + """Saves an animated PNG and returns a UI object for the node output.""" + result = ImageSaveHelper.save_animated_png( + images, + filename_prefix=filename_prefix, + folder_type=FolderType.output, + cls=cls, + fps=fps, + compress_level=compress_level, + ) + return SavedImages([result], is_animated=len(images) > 1) + + @staticmethod + def save_animated_webp( + images, + filename_prefix: str, + folder_type: FolderType, + cls: Type[ComfyNode] | None, + fps: float, + lossless: bool, + quality: int, + method: int, + ) -> SavedResult: + """Saves a batch of images as a single animated WebP.""" + full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path( + filename_prefix, _get_directory_by_folder_type(folder_type), images[0].shape[1], images[0].shape[0] + ) + pil_images = [ImageSaveHelper._convert_tensor_to_pil(img) for img in images] + pil_exif = ImageSaveHelper._create_webp_metadata(pil_images[0], cls) + file = f"{filename}_{counter:05}_.webp" + pil_images[0].save( + os.path.join(full_output_folder, file), + save_all=True, + duration=int(1000.0 / fps), + append_images=pil_images[1:], + exif=pil_exif, + lossless=lossless, + quality=quality, + method=method, + ) + return SavedResult(file, subfolder, folder_type) + + @staticmethod + def get_save_animated_webp_ui( + images, + filename_prefix: str, + cls: Type[ComfyNode] | None, + fps: float, + lossless: bool, + quality: int, + method: int, + ) -> SavedImages: + """Saves an animated WebP and returns a UI object for the node output.""" + result = ImageSaveHelper.save_animated_webp( + images, + filename_prefix=filename_prefix, + folder_type=FolderType.output, + cls=cls, + fps=fps, + lossless=lossless, + quality=quality, + method=method, + ) + return SavedImages([result], is_animated=len(images) > 1) + + +class AudioSaveHelper: + """A helper class with static methods to handle audio saving and metadata.""" + _OPUS_RATES = [8000, 12000, 16000, 24000, 48000] + + @staticmethod + def save_audio( + audio: dict, + filename_prefix: str, + folder_type: FolderType, + cls: Type[ComfyNode] | None, + format: str = "flac", + quality: str = "128k", + ) -> list[SavedResult]: + full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path( + filename_prefix, _get_directory_by_folder_type(folder_type) + ) + + metadata = {} + if not args.disable_metadata and cls is not None: + if cls.hidden.prompt is not None: + metadata["prompt"] = json.dumps(cls.hidden.prompt) + if cls.hidden.extra_pnginfo is not None: + for x in cls.hidden.extra_pnginfo: + metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x]) + + results = [] + for batch_number, waveform in enumerate(audio["waveform"].cpu()): + filename_with_batch_num = filename.replace("%batch_num%", str(batch_number)) + file = f"{filename_with_batch_num}_{counter:05}_.{format}" + output_path = os.path.join(full_output_folder, file) + + # Use original sample rate initially + sample_rate = audio["sample_rate"] + + # Handle Opus sample rate requirements + if format == "opus": + if sample_rate > 48000: + sample_rate = 48000 + elif sample_rate not in AudioSaveHelper._OPUS_RATES: + # Find the next highest supported rate + for rate in sorted(AudioSaveHelper._OPUS_RATES): + if rate > sample_rate: + sample_rate = rate + break + if sample_rate not in AudioSaveHelper._OPUS_RATES: # Fallback if still not supported + sample_rate = 48000 + + # Resample if necessary + if sample_rate != audio["sample_rate"]: + waveform = torchaudio.functional.resample(waveform, audio["sample_rate"], sample_rate) + + # Create output with specified format + output_buffer = BytesIO() + output_container = av.open(output_buffer, mode="w", format=format) + + # Set metadata on the container + for key, value in metadata.items(): + output_container.metadata[key] = value + + # Set up the output stream with appropriate properties + if format == "opus": + out_stream = output_container.add_stream("libopus", rate=sample_rate) + if quality == "64k": + out_stream.bit_rate = 64000 + elif quality == "96k": + out_stream.bit_rate = 96000 + elif quality == "128k": + out_stream.bit_rate = 128000 + elif quality == "192k": + out_stream.bit_rate = 192000 + elif quality == "320k": + out_stream.bit_rate = 320000 + elif format == "mp3": + out_stream = output_container.add_stream("libmp3lame", rate=sample_rate) + if quality == "V0": + # TODO i would really love to support V3 and V5 but there doesn't seem to be a way to set the qscale level, the property below is a bool + out_stream.codec_context.qscale = 1 + elif quality == "128k": + out_stream.bit_rate = 128000 + elif quality == "320k": + out_stream.bit_rate = 320000 + else: # format == "flac": + out_stream = output_container.add_stream("flac", rate=sample_rate) + + frame = av.AudioFrame.from_ndarray( + waveform.movedim(0, 1).reshape(1, -1).float().numpy(), + format="flt", + layout="mono" if waveform.shape[0] == 1 else "stereo", + ) + frame.sample_rate = sample_rate + frame.pts = 0 + output_container.mux(out_stream.encode(frame)) + + # Flush encoder + output_container.mux(out_stream.encode(None)) + + # Close containers + output_container.close() + + # Write the output to file + output_buffer.seek(0) + with open(output_path, "wb") as f: + f.write(output_buffer.getbuffer()) + + results.append(SavedResult(file, subfolder, folder_type)) + counter += 1 + + return results + + @staticmethod + def get_save_audio_ui( + audio, filename_prefix: str, cls: Type[ComfyNode] | None, format: str = "flac", quality: str = "128k", + ) -> SavedAudios: + """Save and instantly wrap for UI.""" + return SavedAudios( + AudioSaveHelper.save_audio( + audio, + filename_prefix=filename_prefix, + folder_type=FolderType.output, + cls=cls, + format=format, + quality=quality, + ) + ) + + +class PreviewImage(_UIOutput): + def __init__(self, image: Image.Type, animated: bool = False, cls: Type[ComfyNode] = None, **kwargs): + self.values = ImageSaveHelper.save_images( + image, + filename_prefix="ComfyUI_temp_" + ''.join(random.choice("abcdefghijklmnopqrstupvxyz") for _ in range(5)), + folder_type=FolderType.temp, + cls=cls, + compress_level=1, + ) + self.animated = animated + + def as_dict(self): + return { + "images": self.values, + "animated": (self.animated,) + } + + +class PreviewMask(PreviewImage): + def __init__(self, mask: PreviewMask.Type, animated: bool=False, cls: ComfyNode=None, **kwargs): + preview = mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])).movedim(1, -1).expand(-1, -1, -1, 3) + super().__init__(preview, animated, cls, **kwargs) + + +class PreviewAudio(_UIOutput): + def __init__(self, audio: dict, cls: Type[ComfyNode] = None, **kwargs): + self.values = AudioSaveHelper.save_audio( + audio, + filename_prefix="ComfyUI_temp_" + "".join(random.choice("abcdefghijklmnopqrstuvwxyz") for _ in range(5)), + folder_type=FolderType.temp, + cls=cls, + format="flac", + quality="128k", + ) + + def as_dict(self) -> dict: + return {"audio": self.values} + + +class PreviewVideo(_UIOutput): + def __init__(self, values: list[SavedResult | dict], **kwargs): + self.values = values + + def as_dict(self): + return {"images": self.values, "animated": (True,)} + + +class PreviewUI3D(_UIOutput): + def __init__(self, model_file, camera_info, **kwargs): + self.model_file = model_file + self.camera_info = camera_info + + def as_dict(self): + return {"result": [self.model_file, self.camera_info]} + + +class PreviewText(_UIOutput): + def __init__(self, value: str, **kwargs): + self.value = value + + def as_dict(self): + return {"text": (self.value,)} + + +class _UI: + SavedResult = SavedResult + SavedImages = SavedImages + SavedAudios = SavedAudios + ImageSaveHelper = ImageSaveHelper + AudioSaveHelper = AudioSaveHelper + PreviewImage = PreviewImage + PreviewMask = PreviewMask + PreviewAudio = PreviewAudio + PreviewVideo = PreviewVideo + PreviewUI3D = PreviewUI3D + PreviewText = PreviewText diff --git a/comfy_api/v0_0_2/__init__.py b/comfy_api/v0_0_2/__init__.py index ea83833fb..de0f95001 100644 --- a/comfy_api/v0_0_2/__init__.py +++ b/comfy_api/v0_0_2/__init__.py @@ -6,6 +6,7 @@ from comfy_api.latest import ( ) from typing import Type, TYPE_CHECKING from comfy_api.internal.async_to_sync import create_sync_class +from comfy_api.latest import io, ui, ComfyExtension #noqa: F401 class ComfyAPIAdapter_v0_0_2(ComfyAPI_latest): @@ -40,4 +41,5 @@ __all__ = [ "Input", "InputImpl", "Types", + "ComfyExtension", ] diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index 60e2ab91e..f4b427265 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -4,9 +4,12 @@ from typing import Type, Literal import nodes import asyncio import inspect -from comfy_execution.graph_utils import is_link +from comfy_execution.graph_utils import is_link, ExecutionBlocker from comfy.comfy_types.node_typing import ComfyNodeABC, InputTypeDict, InputTypeOptions +# NOTE: ExecutionBlocker code got moved to graph_utils.py to prevent torch being imported too soon during unit tests +ExecutionBlocker = ExecutionBlocker + class DependencyCycleError(Exception): pass @@ -294,21 +297,3 @@ class ExecutionList(TopologicalSort): del blocked_by[node_id] to_remove = [node_id for node_id in blocked_by if len(blocked_by[node_id]) == 0] return list(blocked_by.keys()) - -class ExecutionBlocker: - """ - Return this from a node and any users will be blocked with the given error message. - If the message is None, execution will be blocked silently instead. - Generally, you should avoid using this functionality unless absolutely necessary. Whenever it's - possible, a lazy input will be more efficient and have a better user experience. - This functionality is useful in two cases: - 1. You want to conditionally prevent an output node from executing. (Particularly a built-in node - like SaveImage. For your own output nodes, I would recommend just adding a BOOL input and using - lazy evaluation to let it conditionally disable itself.) - 2. You have a node with multiple possible outputs, some of which are invalid and should not be used. - (I would recommend not making nodes like this in the future -- instead, make multiple nodes with - different outputs. Unfortunately, there are several popular existing nodes using this pattern.) - """ - def __init__(self, message): - self.message = message - diff --git a/comfy_execution/graph_utils.py b/comfy_execution/graph_utils.py index 8595e942d..496d2c634 100644 --- a/comfy_execution/graph_utils.py +++ b/comfy_execution/graph_utils.py @@ -137,3 +137,19 @@ def add_graph_prefix(graph, outputs, prefix): return new_graph, tuple(new_outputs) +class ExecutionBlocker: + """ + Return this from a node and any users will be blocked with the given error message. + If the message is None, execution will be blocked silently instead. + Generally, you should avoid using this functionality unless absolutely necessary. Whenever it's + possible, a lazy input will be more efficient and have a better user experience. + This functionality is useful in two cases: + 1. You want to conditionally prevent an output node from executing. (Particularly a built-in node + like SaveImage. For your own output nodes, I would recommend just adding a BOOL input and using + lazy evaluation to let it conditionally disable itself.) + 2. You have a node with multiple possible outputs, some of which are invalid and should not be used. + (I would recommend not making nodes like this in the future -- instead, make multiple nodes with + different outputs. Unfortunately, there are several popular existing nodes using this pattern.) + """ + def __init__(self, message): + self.message = message diff --git a/execution.py b/execution.py index cde14c52f..952f0cc5c 100644 --- a/execution.py +++ b/execution.py @@ -32,6 +32,8 @@ from comfy_execution.graph_utils import GraphBuilder, is_link from comfy_execution.validation import validate_node_input from comfy_execution.progress import get_progress_state, reset_progress_state, add_progress_handler, WebUIProgressHandler from comfy_execution.utils import CurrentNodeContext +from comfy_api.internal import _ComfyNodeInternal, _NodeOutputInternal, first_real_override, is_class, make_locked_method_func +from comfy_api.latest import io class ExecutionResult(Enum): @@ -56,7 +58,15 @@ class IsChangedCache: node = self.dynprompt.get_node(node_id) class_type = node["class_type"] class_def = nodes.NODE_CLASS_MAPPINGS[class_type] - if not hasattr(class_def, "IS_CHANGED"): + has_is_changed = False + is_changed_name = None + if issubclass(class_def, _ComfyNodeInternal) and first_real_override(class_def, "fingerprint_inputs") is not None: + has_is_changed = True + is_changed_name = "fingerprint_inputs" + elif hasattr(class_def, "IS_CHANGED"): + has_is_changed = True + is_changed_name = "IS_CHANGED" + if not has_is_changed: self.is_changed[node_id] = False return self.is_changed[node_id] @@ -65,9 +75,9 @@ class IsChangedCache: return self.is_changed[node_id] # Intentionally do not use cached outputs here. We only want constants in IS_CHANGED - input_data_all, _ = get_input_data(node["inputs"], class_def, node_id, None) + input_data_all, _, hidden_inputs = get_input_data(node["inputs"], class_def, node_id, None) try: - is_changed = await _async_map_node_over_list(self.prompt_id, node_id, class_def, input_data_all, "IS_CHANGED") + is_changed = await _async_map_node_over_list(self.prompt_id, node_id, class_def, input_data_all, is_changed_name) is_changed = await resolve_map_node_over_list_results(is_changed) node["is_changed"] = [None if isinstance(x, ExecutionBlocker) else x for x in is_changed] except Exception as e: @@ -126,9 +136,14 @@ class CacheSet: SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org") def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, extra_data={}): - valid_inputs = class_def.INPUT_TYPES() + is_v3 = issubclass(class_def, _ComfyNodeInternal) + if is_v3: + valid_inputs, schema = class_def.INPUT_TYPES(include_hidden=False, return_schema=True) + else: + valid_inputs = class_def.INPUT_TYPES() input_data_all = {} missing_keys = {} + hidden_inputs_v3 = {} for x in inputs: input_data = inputs[x] _, input_category, input_info = get_input_info(class_def, x, valid_inputs) @@ -153,22 +168,37 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e elif input_category is not None: input_data_all[x] = [input_data] - if "hidden" in valid_inputs: - h = valid_inputs["hidden"] - for x in h: - if h[x] == "PROMPT": - input_data_all[x] = [dynprompt.get_original_prompt() if dynprompt is not None else {}] - if h[x] == "DYNPROMPT": - input_data_all[x] = [dynprompt] - if h[x] == "EXTRA_PNGINFO": - input_data_all[x] = [extra_data.get('extra_pnginfo', None)] - if h[x] == "UNIQUE_ID": - input_data_all[x] = [unique_id] - if h[x] == "AUTH_TOKEN_COMFY_ORG": - input_data_all[x] = [extra_data.get("auth_token_comfy_org", None)] - if h[x] == "API_KEY_COMFY_ORG": - input_data_all[x] = [extra_data.get("api_key_comfy_org", None)] - return input_data_all, missing_keys + if is_v3: + if schema.hidden: + if io.Hidden.prompt in schema.hidden: + hidden_inputs_v3[io.Hidden.prompt] = dynprompt.get_original_prompt() if dynprompt is not None else {} + if io.Hidden.dynprompt in schema.hidden: + hidden_inputs_v3[io.Hidden.dynprompt] = dynprompt + if io.Hidden.extra_pnginfo in schema.hidden: + hidden_inputs_v3[io.Hidden.extra_pnginfo] = extra_data.get('extra_pnginfo', None) + if io.Hidden.unique_id in schema.hidden: + hidden_inputs_v3[io.Hidden.unique_id] = unique_id + if io.Hidden.auth_token_comfy_org in schema.hidden: + hidden_inputs_v3[io.Hidden.auth_token_comfy_org] = extra_data.get("auth_token_comfy_org", None) + if io.Hidden.api_key_comfy_org in schema.hidden: + hidden_inputs_v3[io.Hidden.api_key_comfy_org] = extra_data.get("api_key_comfy_org", None) + else: + if "hidden" in valid_inputs: + h = valid_inputs["hidden"] + for x in h: + if h[x] == "PROMPT": + input_data_all[x] = [dynprompt.get_original_prompt() if dynprompt is not None else {}] + if h[x] == "DYNPROMPT": + input_data_all[x] = [dynprompt] + if h[x] == "EXTRA_PNGINFO": + input_data_all[x] = [extra_data.get('extra_pnginfo', None)] + if h[x] == "UNIQUE_ID": + input_data_all[x] = [unique_id] + if h[x] == "AUTH_TOKEN_COMFY_ORG": + input_data_all[x] = [extra_data.get("auth_token_comfy_org", None)] + if h[x] == "API_KEY_COMFY_ORG": + input_data_all[x] = [extra_data.get("api_key_comfy_org", None)] + return input_data_all, missing_keys, hidden_inputs_v3 map_node_over_list = None #Don't hook this please @@ -184,7 +214,7 @@ async def resolve_map_node_over_list_results(results): raise exc return [x.result() if isinstance(x, asyncio.Task) else x for x in results] -async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None): +async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None, hidden_inputs=None): # check if node wants the lists input_is_list = getattr(obj, "INPUT_IS_LIST", False) @@ -214,7 +244,22 @@ async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, f if execution_block is None: if pre_execute_cb is not None and index is not None: pre_execute_cb(index) - f = getattr(obj, func) + # V3 + if isinstance(obj, _ComfyNodeInternal) or (is_class(obj) and issubclass(obj, _ComfyNodeInternal)): + # if is just a class, then assign no resources or state, just create clone + if is_class(obj): + type_obj = obj + obj.VALIDATE_CLASS() + class_clone = obj.PREPARE_CLASS_CLONE(hidden_inputs) + # otherwise, use class instance to populate/reuse some fields + else: + type_obj = type(obj) + type_obj.VALIDATE_CLASS() + class_clone = type_obj.PREPARE_CLASS_CLONE(hidden_inputs) + f = make_locked_method_func(type_obj, func, class_clone) + # V1 + else: + f = getattr(obj, func) if inspect.iscoroutinefunction(f): async def async_wrapper(f, prompt_id, unique_id, list_index, args): with CurrentNodeContext(prompt_id, unique_id, list_index): @@ -266,8 +311,8 @@ def merge_result_data(results, obj): output.append([o[i] for o in results]) return output -async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, pre_execute_cb=None): - return_values = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb) +async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, pre_execute_cb=None, hidden_inputs=None): + return_values = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, hidden_inputs=hidden_inputs) has_pending_task = any(isinstance(r, asyncio.Task) and not r.done() for r in return_values) if has_pending_task: return return_values, {}, False, has_pending_task @@ -298,6 +343,26 @@ def get_output_from_returns(return_values, obj): result = tuple([result] * len(obj.RETURN_TYPES)) results.append(result) subgraph_results.append((None, result)) + elif isinstance(r, _NodeOutputInternal): + # V3 + if r.ui is not None: + if isinstance(r.ui, dict): + uis.append(r.ui) + else: + uis.append(r.ui.as_dict()) + if r.expand is not None: + has_subgraph = True + new_graph = r.expand + result = r.result + if r.block_execution is not None: + result = tuple([ExecutionBlocker(r.block_execution)] * len(obj.RETURN_TYPES)) + subgraph_results.append((new_graph, result)) + elif r.result is not None: + result = r.result + if r.block_execution is not None: + result = tuple([ExecutionBlocker(r.block_execution)] * len(obj.RETURN_TYPES)) + results.append(result) + subgraph_results.append((None, result)) else: if isinstance(r, ExecutionBlocker): r = tuple([r] * len(obj.RETURN_TYPES)) @@ -381,7 +446,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, has_subgraph = False else: get_progress_state().start_progress(unique_id) - input_data_all, missing_keys = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data) + input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data) if server.client_id is not None: server.last_node_id = display_node_id server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id) @@ -391,8 +456,12 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, obj = class_def() caches.objects.set(unique_id, obj) - if hasattr(obj, "check_lazy_status"): - required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True) + if issubclass(class_def, _ComfyNodeInternal): + lazy_status_present = first_real_override(class_def, "check_lazy_status") is not None + else: + lazy_status_present = getattr(obj, "check_lazy_status", None) is not None + if lazy_status_present: + required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True, hidden_inputs=hidden_inputs) required_inputs = await resolve_map_node_over_list_results(required_inputs) required_inputs = set(sum([r for r in required_inputs if isinstance(r,list)], [])) required_inputs = [x for x in required_inputs if isinstance(x,str) and ( @@ -424,7 +493,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, def pre_execute_cb(call_index): # TODO - How to handle this with async functions without contextvars (which requires Python 3.12)? GraphBuilder.set_default_prefix(unique_id, call_index, 0) - output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb) + output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, hidden_inputs=hidden_inputs) if has_pending_tasks: pending_async_nodes[unique_id] = output_data unblock = execution_list.add_external_block(unique_id) @@ -672,8 +741,14 @@ async def validate_inputs(prompt_id, prompt, item, validated): validate_function_inputs = [] validate_has_kwargs = False - if hasattr(obj_class, "VALIDATE_INPUTS"): - argspec = inspect.getfullargspec(obj_class.VALIDATE_INPUTS) + if issubclass(obj_class, _ComfyNodeInternal): + validate_function_name = "validate_inputs" + validate_function = first_real_override(obj_class, validate_function_name) + else: + validate_function_name = "VALIDATE_INPUTS" + validate_function = getattr(obj_class, validate_function_name, None) + if validate_function is not None: + argspec = inspect.getfullargspec(validate_function) validate_function_inputs = argspec.args validate_has_kwargs = argspec.varkw is not None received_types = {} @@ -848,7 +923,7 @@ async def validate_inputs(prompt_id, prompt, item, validated): continue if len(validate_function_inputs) > 0 or validate_has_kwargs: - input_data_all, _ = get_input_data(inputs, obj_class, unique_id) + input_data_all, _, hidden_inputs = get_input_data(inputs, obj_class, unique_id) input_filtered = {} for x in input_data_all: if x in validate_function_inputs or validate_has_kwargs: @@ -856,8 +931,7 @@ async def validate_inputs(prompt_id, prompt, item, validated): if 'input_types' in validate_function_inputs: input_filtered['input_types'] = [received_types] - #ret = obj_class.VALIDATE_INPUTS(**input_filtered) - ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, "VALIDATE_INPUTS") + ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, validate_function_name, hidden_inputs=hidden_inputs) ret = await resolve_map_node_over_list_results(ret) for x in input_filtered: for i, r in enumerate(ret): diff --git a/nodes.py b/nodes.py index 54e530388..da4a46366 100644 --- a/nodes.py +++ b/nodes.py @@ -6,6 +6,7 @@ import os import sys import json import hashlib +import inspect import traceback import math import time @@ -29,6 +30,7 @@ import comfy.controlnet from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict, FileLocator from comfy_api.internal import register_versions, ComfyAPIWithVersion from comfy_api.version_list import supported_versions +from comfy_api.latest import io, ComfyExtension import comfy.clip_vision @@ -2152,6 +2154,7 @@ async def load_custom_node(module_path: str, ignore=set(), module_parent="custom if os.path.isdir(web_dir): EXTENSION_WEB_DIRS[module_name] = web_dir + # V1 node definition if hasattr(module, "NODE_CLASS_MAPPINGS") and getattr(module, "NODE_CLASS_MAPPINGS") is not None: for name, node_cls in module.NODE_CLASS_MAPPINGS.items(): if name not in ignore: @@ -2160,8 +2163,38 @@ async def load_custom_node(module_path: str, ignore=set(), module_parent="custom if hasattr(module, "NODE_DISPLAY_NAME_MAPPINGS") and getattr(module, "NODE_DISPLAY_NAME_MAPPINGS") is not None: NODE_DISPLAY_NAME_MAPPINGS.update(module.NODE_DISPLAY_NAME_MAPPINGS) return True + # V3 Extension Definition + elif hasattr(module, "comfy_entrypoint"): + entrypoint = getattr(module, "comfy_entrypoint") + if not callable(entrypoint): + logging.warning(f"comfy_entrypoint in {module_path} is not callable, skipping.") + return False + try: + if inspect.iscoroutinefunction(entrypoint): + extension = await entrypoint() + else: + extension = entrypoint() + if not isinstance(extension, ComfyExtension): + logging.warning(f"comfy_entrypoint in {module_path} did not return a ComfyExtension, skipping.") + return False + node_list = await extension.get_node_list() + if not isinstance(node_list, list): + logging.warning(f"comfy_entrypoint in {module_path} did not return a list of nodes, skipping.") + return False + for node_cls in node_list: + node_cls: io.ComfyNode + schema = node_cls.GET_SCHEMA() + if schema.node_id not in ignore: + NODE_CLASS_MAPPINGS[schema.node_id] = node_cls + node_cls.RELATIVE_PYTHON_MODULE = "{}.{}".format(module_parent, get_module_name(module_path)) + if schema.display_name is not None: + NODE_DISPLAY_NAME_MAPPINGS[schema.node_id] = schema.display_name + return True + except Exception as e: + logging.warning(f"Error while calling comfy_entrypoint in {module_path}: {e}") + return False else: - logging.warning(f"Skip {module_path} module for custom nodes due to the lack of NODE_CLASS_MAPPINGS.") + logging.warning(f"Skip {module_path} module for custom nodes due to the lack of NODE_CLASS_MAPPINGS or NODES_LIST (need one).") return False except Exception as e: logging.warning(traceback.format_exc()) @@ -2286,7 +2319,7 @@ async def init_builtin_extra_nodes(): "nodes_string.py", "nodes_camera_trajectory.py", "nodes_edit_model.py", - "nodes_tcfg.py" + "nodes_tcfg.py", ] import_failed = [] diff --git a/server.py b/server.py index 3e06d2fbb..0553a0dd7 100644 --- a/server.py +++ b/server.py @@ -30,6 +30,7 @@ from comfy_api import feature_flags import node_helpers from comfyui_version import __version__ from app.frontend_management import FrontendManager +from comfy_api.internal import _ComfyNodeInternal from app.user_manager import UserManager from app.model_manager import ModelFileManager @@ -591,6 +592,8 @@ class PromptServer(): def node_info(node_class): obj_class = nodes.NODE_CLASS_MAPPINGS[node_class] + if issubclass(obj_class, _ComfyNodeInternal): + return obj_class.GET_NODE_INFO_V1() info = {} info['input'] = obj_class.INPUT_TYPES() info['input_order'] = {key: list(value.keys()) for (key, value) in obj_class.INPUT_TYPES().items()} From 5ee381c058d606209dcafb568af20196e7884fc8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 31 Jul 2025 20:33:27 -0700 Subject: [PATCH 083/759] Fix WanFirstLastFrameToVideo node when no clip vision. (#9134) --- comfy_extras/nodes_wan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 0b92c68ac..0067d054d 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -149,6 +149,7 @@ class WanFirstLastFrameToVideo: positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) + clip_vision_output = None if clip_vision_start_image is not None: clip_vision_output = clip_vision_start_image From 4696d74305e98a96bda5685b7f11d6ba167c2ed3 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Fri, 1 Aug 2025 15:06:18 +0800 Subject: [PATCH 084/759] update template to 0.1.45 (#9135) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8f2f6a56c..3828c5b91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.44 +comfyui-workflow-templates==0.1.45 comfyui-embedded-docs==0.2.4 torch torchsde From 1e638a140b2f459595fafc73ade5ea5b4024d4b4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 1 Aug 2025 02:25:38 -0700 Subject: [PATCH 085/759] Tiny wan vae optimizations. (#9136) --- comfy/ldm/wan/vae.py | 13 +++++++++---- comfy/ldm/wan/vae2_2.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/comfy/ldm/wan/vae.py b/comfy/ldm/wan/vae.py index 6b07840fc..791596938 100644 --- a/comfy/ldm/wan/vae.py +++ b/comfy/ldm/wan/vae.py @@ -24,12 +24,17 @@ class CausalConv3d(ops.Conv3d): self.padding[1], 2 * self.padding[0], 0) self.padding = (0, 0, 0) - def forward(self, x, cache_x=None): + def forward(self, x, cache_x=None, cache_list=None, cache_idx=None): + if cache_list is not None: + cache_x = cache_list[cache_idx] + cache_list[cache_idx] = None + padding = list(self._padding) if cache_x is not None and self._padding[4] > 0: cache_x = cache_x.to(x.device) x = torch.cat([cache_x, x], dim=2) padding[4] -= cache_x.shape[2] + del cache_x x = F.pad(x, padding) return super().forward(x) @@ -166,7 +171,7 @@ class ResidualBlock(nn.Module): if in_dim != out_dim else nn.Identity() def forward(self, x, feat_cache=None, feat_idx=[0]): - h = self.shortcut(x) + old_x = x for layer in self.residual: if isinstance(layer, CausalConv3d) and feat_cache is not None: idx = feat_idx[0] @@ -178,12 +183,12 @@ class ResidualBlock(nn.Module): cache_x.device), cache_x ], dim=2) - x = layer(x, feat_cache[idx]) + x = layer(x, cache_list=feat_cache, cache_idx=idx) feat_cache[idx] = cache_x feat_idx[0] += 1 else: x = layer(x) - return x + h + return x + self.shortcut(old_x) class AttentionBlock(nn.Module): diff --git a/comfy/ldm/wan/vae2_2.py b/comfy/ldm/wan/vae2_2.py index b9c2d1a26..1f6d584a2 100644 --- a/comfy/ldm/wan/vae2_2.py +++ b/comfy/ldm/wan/vae2_2.py @@ -151,7 +151,7 @@ class ResidualBlock(nn.Module): ], dim=2, ) - x = layer(x, feat_cache[idx]) + x = layer(x, cache_list=feat_cache, cache_idx=idx) feat_cache[idx] = cache_x feat_idx[0] += 1 else: From bff60b5cfc10d1b037a95746226ac6698dc3e373 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Fri, 1 Aug 2025 20:03:22 -0400 Subject: [PATCH 086/759] ComfyUI version 0.3.48 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 20a2e892a..7b29e338d 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.47" +__version__ = "0.3.48" diff --git a/pyproject.toml b/pyproject.toml index 244fdd232..256677fad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.47" +version = "0.3.48" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 3dfefc88d00bde744b729b073058a57e149cddc1 Mon Sep 17 00:00:00 2001 From: Johnpaul Chiwetelu <49923152+Myestery@users.noreply.github.com> Date: Sat, 2 Aug 2025 03:02:06 +0100 Subject: [PATCH 087/759] API for Recently Used Items (#8792) * feat: add file creation time to model file metadata and user file info * fix linting --- app/model_manager.py | 21 ++++++++++++++++----- app/user_manager.py | 4 +++- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/app/model_manager.py b/app/model_manager.py index 74d942fb8..ab36bca74 100644 --- a/app/model_manager.py +++ b/app/model_manager.py @@ -130,10 +130,21 @@ class ModelFileManager: for file_name in filenames: try: - relative_path = os.path.relpath(os.path.join(dirpath, file_name), directory) - result.append(relative_path) - except: - logging.warning(f"Warning: Unable to access {file_name}. Skipping this file.") + full_path = os.path.join(dirpath, file_name) + relative_path = os.path.relpath(full_path, directory) + + # Get file metadata + file_info = { + "name": relative_path, + "pathIndex": pathIndex, + "modified": os.path.getmtime(full_path), # Add modification time + "created": os.path.getctime(full_path), # Add creation time + "size": os.path.getsize(full_path) # Add file size + } + result.append(file_info) + + except Exception as e: + logging.warning(f"Warning: Unable to access {file_name}. Error: {e}. Skipping this file.") continue for d in subdirs: @@ -144,7 +155,7 @@ class ModelFileManager: logging.warning(f"Warning: Unable to access {path}. Skipping this path.") continue - return [{"name": f, "pathIndex": pathIndex} for f in result], dirs, time.perf_counter() + return result, dirs, time.perf_counter() def get_model_previews(self, filepath: str) -> list[str | BytesIO]: dirname = os.path.dirname(filepath) diff --git a/app/user_manager.py b/app/user_manager.py index d31da5b9b..0ec3e46ea 100644 --- a/app/user_manager.py +++ b/app/user_manager.py @@ -20,13 +20,15 @@ class FileInfo(TypedDict): path: str size: int modified: int + created: int def get_file_info(path: str, relative_to: str) -> FileInfo: return { "path": os.path.relpath(path, relative_to).replace(os.sep, '/'), "size": os.path.getsize(path), - "modified": os.path.getmtime(path) + "modified": os.path.getmtime(path), + "created": os.path.getctime(path) } From fbcc23945dc377c8623bbee6132f15a93ac0c84a Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sun, 3 Aug 2025 02:15:29 +0800 Subject: [PATCH 088/759] Update template to 0.1.47 (#9153) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3828c5b91..ffa7dce65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.45 +comfyui-workflow-templates==0.1.47 comfyui-embedded-docs==0.2.4 torch torchsde From 5f582a97572e87ebfa655d379e8c8f7611c0249f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 2 Aug 2025 12:00:13 -0700 Subject: [PATCH 089/759] Make sure all the conds are on the right device. (#9151) --- comfy/model_base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 6b7978949..3ff8106d7 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -106,10 +106,12 @@ def model_sampling(model_config, model_type): return ModelSampling(model_config) -def convert_tensor(extra, dtype): +def convert_tensor(extra, dtype, device): if hasattr(extra, "dtype"): if extra.dtype != torch.int and extra.dtype != torch.long: - extra = extra.to(dtype) + extra = extra.to(dtype=dtype, device=device) + else: + extra = extra.to(device=device) return extra @@ -174,15 +176,16 @@ class BaseModel(torch.nn.Module): context = context.to(dtype) extra_conds = {} + device = xc.device for o in kwargs: extra = kwargs[o] if hasattr(extra, "dtype"): - extra = convert_tensor(extra, dtype) + extra = convert_tensor(extra, dtype, device) elif isinstance(extra, list): ex = [] for ext in extra: - ex.append(convert_tensor(ext, dtype)) + ex.append(convert_tensor(ext, dtype, device)) extra = ex extra_conds[o] = extra From 13aaa66ec21c397240a9b972d818430b39112588 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 2 Aug 2025 12:09:23 -0700 Subject: [PATCH 090/759] Make sure context is on the right device. (#9154) --- comfy/model_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 3ff8106d7..4556ee138 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -171,12 +171,12 @@ class BaseModel(torch.nn.Module): dtype = self.manual_cast_dtype xc = xc.to(dtype) + device = xc.device t = self.model_sampling.timestep(t).float() if context is not None: - context = context.to(dtype) + context = context.to(dtype=dtype, device=device) extra_conds = {} - device = xc.device for o in kwargs: extra = kwargs[o] From aebac221937b511d46fe601656acdc753435b849 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 3 Aug 2025 04:08:11 -0700 Subject: [PATCH 091/759] Cleanup. (#9160) --- comfy/controlnet.py | 1 - 1 file changed, 1 deletion(-) diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 9a47b86f2..6ed8bd756 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -43,7 +43,6 @@ if TYPE_CHECKING: def broadcast_image_to(tensor, target_batch_size, batched_number): current_batch_size = tensor.shape[0] - #print(current_batch_size, target_batch_size) if current_batch_size == 1: return tensor From 182f90b5eca2baa25474223759039925b286d562 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 4 Aug 2025 00:11:53 -0700 Subject: [PATCH 092/759] Lower cond vram use by casting at the same time as device transfer. (#9159) --- comfy/conds.py | 14 +++++++------- comfy/model_base.py | 6 +++--- comfy/samplers.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/comfy/conds.py b/comfy/conds.py index 2af2a43a3..f2564e7ef 100644 --- a/comfy/conds.py +++ b/comfy/conds.py @@ -10,8 +10,8 @@ class CONDRegular: def _copy_with(self, cond): return self.__class__(cond) - def process_cond(self, batch_size, device, **kwargs): - return self._copy_with(comfy.utils.repeat_to_batch_size(self.cond, batch_size).to(device)) + def process_cond(self, batch_size, **kwargs): + return self._copy_with(comfy.utils.repeat_to_batch_size(self.cond, batch_size)) def can_concat(self, other): if self.cond.shape != other.cond.shape: @@ -29,14 +29,14 @@ class CONDRegular: class CONDNoiseShape(CONDRegular): - def process_cond(self, batch_size, device, area, **kwargs): + def process_cond(self, batch_size, area, **kwargs): data = self.cond if area is not None: dims = len(area) // 2 for i in range(dims): data = data.narrow(i + 2, area[i + dims], area[i]) - return self._copy_with(comfy.utils.repeat_to_batch_size(data, batch_size).to(device)) + return self._copy_with(comfy.utils.repeat_to_batch_size(data, batch_size)) class CONDCrossAttn(CONDRegular): @@ -73,7 +73,7 @@ class CONDConstant(CONDRegular): def __init__(self, cond): self.cond = cond - def process_cond(self, batch_size, device, **kwargs): + def process_cond(self, batch_size, **kwargs): return self._copy_with(self.cond) def can_concat(self, other): @@ -92,10 +92,10 @@ class CONDList(CONDRegular): def __init__(self, cond): self.cond = cond - def process_cond(self, batch_size, device, **kwargs): + def process_cond(self, batch_size, **kwargs): out = [] for c in self.cond: - out.append(comfy.utils.repeat_to_batch_size(c, batch_size).to(device)) + out.append(comfy.utils.repeat_to_batch_size(c, batch_size)) return self._copy_with(out) diff --git a/comfy/model_base.py b/comfy/model_base.py index 4556ee138..3a9c031ea 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -109,9 +109,9 @@ def model_sampling(model_config, model_type): def convert_tensor(extra, dtype, device): if hasattr(extra, "dtype"): if extra.dtype != torch.int and extra.dtype != torch.long: - extra = extra.to(dtype=dtype, device=device) + extra = comfy.model_management.cast_to_device(extra, device, dtype) else: - extra = extra.to(device=device) + extra = comfy.model_management.cast_to_device(extra, device, None) return extra @@ -174,7 +174,7 @@ class BaseModel(torch.nn.Module): device = xc.device t = self.model_sampling.timestep(t).float() if context is not None: - context = context.to(dtype=dtype, device=device) + context = comfy.model_management.cast_to_device(context, device, dtype) extra_conds = {} for o in kwargs: diff --git a/comfy/samplers.py b/comfy/samplers.py index e93d2a315..ad2f40cdc 100644 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -89,7 +89,7 @@ def get_area_and_mult(conds, x_in, timestep_in): conditioning = {} model_conds = conds["model_conds"] for c in model_conds: - conditioning[c] = model_conds[c].process_cond(batch_size=x_in.shape[0], device=x_in.device, area=area) + conditioning[c] = model_conds[c].process_cond(batch_size=x_in.shape[0], area=area) hooks = conds.get('hooks', None) control = conds.get('control', None) From 140ffc7fdc53e810030f060e421c1f528c2d2ab9 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 4 Aug 2025 00:28:12 -0700 Subject: [PATCH 093/759] Fix broken controlnet from last PR. (#9167) --- comfy/controlnet.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 6ed8bd756..988acdb57 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -28,6 +28,7 @@ import comfy.model_detection import comfy.model_patcher import comfy.ops import comfy.latent_formats +import comfy.model_base import comfy.cldm.cldm import comfy.t2i_adapter.adapter @@ -264,12 +265,12 @@ class ControlNet(ControlBase): for c in self.extra_conds: temp = cond.get(c, None) if temp is not None: - extra[c] = temp.to(dtype) + extra[c] = comfy.model_base.convert_tensor(temp, dtype, x_noisy.device) timestep = self.model_sampling_current.timestep(t) x_noisy = self.model_sampling_current.calculate_input(t, x_noisy) - control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.to(dtype), context=context.to(dtype), **extra) + control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.to(dtype), context=comfy.model_management.cast_to_device(context, x_noisy.device, dtype), **extra) return self.control_merge(control, control_prev, output_dtype=None) def copy(self): From 7991341e89cab521441641505ac4b0eea292a829 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 4 Aug 2025 01:02:40 -0700 Subject: [PATCH 094/759] Various fixes for broken things from earlier PR. (#9168) --- comfy/model_base.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 3a9c031ea..f9591f292 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -401,7 +401,7 @@ class SD21UNCLIP(BaseModel): unclip_conditioning = kwargs.get("unclip_conditioning", None) device = kwargs["device"] if unclip_conditioning is None: - return torch.zeros((1, self.adm_channels)) + return torch.zeros((1, self.adm_channels), device=device) else: return unclip_adm(unclip_conditioning, device, self.noise_augmentor, kwargs.get("unclip_noise_augment_merge", 0.05), kwargs.get("seed", 0) - 10) @@ -409,7 +409,7 @@ def sdxl_pooled(args, noise_augmentor): if "unclip_conditioning" in args: return unclip_adm(args.get("unclip_conditioning", None), args["device"], noise_augmentor, seed=args.get("seed", 0) - 10)[:,:1280] else: - return args["pooled_output"] + return args["pooled_output"].to(device=args["device"]) class SDXLRefiner(BaseModel): def __init__(self, model_config, model_type=ModelType.EPS, device=None): @@ -615,9 +615,11 @@ class IP2P: if image is None: image = torch.zeros_like(noise) + else: + image = image.to(device=device) if image.shape[1:] != noise.shape[1:]: - image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") + image = utils.common_upscale(image, noise.shape[-1], noise.shape[-2], "bilinear", "center") image = utils.resize_to_batch_size(image, noise.shape[0]) return self.process_ip2p_image_in(image) @@ -696,7 +698,7 @@ class StableCascade_B(BaseModel): #size of prior doesn't really matter if zeros because it gets resized but I still want it to get batched prior = kwargs.get("stable_cascade_prior", torch.zeros((1, 16, (noise.shape[2] * 4) // 42, (noise.shape[3] * 4) // 42), dtype=noise.dtype, layout=noise.layout, device=noise.device)) - out["effnet"] = comfy.conds.CONDRegular(prior) + out["effnet"] = comfy.conds.CONDRegular(prior.to(device=noise.device)) out["sca"] = comfy.conds.CONDRegular(torch.zeros((1,))) return out @@ -1161,10 +1163,10 @@ class WAN21_Vace(WAN21): vace_frames_out = [] for j in range(len(vace_frames)): - vf = vace_frames[j].clone() + vf = vace_frames[j].to(device=noise.device, dtype=noise.dtype, copy=True) for i in range(0, vf.shape[1], 16): vf[:, i:i + 16] = self.process_latent_in(vf[:, i:i + 16]) - vf = torch.cat([vf, mask[j]], dim=1) + vf = torch.cat([vf, mask[j].to(device=noise.device, dtype=noise.dtype)], dim=1) vace_frames_out.append(vf) vace_frames = torch.stack(vace_frames_out, dim=1) From 84f9759424ccbd8de710960c79f0f1d28eef2776 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 4 Aug 2025 01:20:12 -0700 Subject: [PATCH 095/759] Add some warnings and prevent crash when cond devices don't match. (#9169) --- comfy/conds.py | 7 +++++++ comfy/model_base.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/comfy/conds.py b/comfy/conds.py index f2564e7ef..5af3e93ea 100644 --- a/comfy/conds.py +++ b/comfy/conds.py @@ -1,6 +1,7 @@ import torch import math import comfy.utils +import logging class CONDRegular: @@ -16,6 +17,9 @@ class CONDRegular: def can_concat(self, other): if self.cond.shape != other.cond.shape: return False + if self.cond.device != other.cond.device: + logging.warning("WARNING: conds not on same device, skipping concat.") + return False return True def concat(self, others): @@ -51,6 +55,9 @@ class CONDCrossAttn(CONDRegular): diff = mult_min // min(s1[1], s2[1]) if diff > 4: #arbitrary limit on the padding because it's probably going to impact performance negatively if it's too much return False + if self.cond.device != other.cond.device: + logging.warning("WARNING: conds not on same device: skipping concat.") + return False return True def concat(self, others): diff --git a/comfy/model_base.py b/comfy/model_base.py index f9591f292..2db81e244 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -409,7 +409,7 @@ def sdxl_pooled(args, noise_augmentor): if "unclip_conditioning" in args: return unclip_adm(args.get("unclip_conditioning", None), args["device"], noise_augmentor, seed=args.get("seed", 0) - 10)[:,:1280] else: - return args["pooled_output"].to(device=args["device"]) + return args["pooled_output"] class SDXLRefiner(BaseModel): def __init__(self, model_config, model_type=ModelType.EPS, device=None): From 03895dea7c4a6cc93fa362cd11ca450217d74b13 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 4 Aug 2025 01:33:04 -0700 Subject: [PATCH 096/759] Fix another issue with the PR. (#9170) --- comfy/model_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 2db81e244..a06686436 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -162,7 +162,7 @@ class BaseModel(torch.nn.Module): xc = self.model_sampling.calculate_input(sigma, x) if c_concat is not None: - xc = torch.cat([xc] + [c_concat], dim=1) + xc = torch.cat([xc] + [comfy.model_management.cast_to_device(c_concat, xc.device, xc.dtype)], dim=1) context = c_crossattn dtype = self.get_dtype() From c012400240d4867cd63a45220eb791b91ad47617 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 4 Aug 2025 19:53:25 -0700 Subject: [PATCH 097/759] Initial support for qwen image model. (#9179) --- comfy/ldm/qwen_image/model.py | 399 ++++++++++++++++++++++++++++++ comfy/model_base.py | 12 + comfy/model_detection.py | 7 +- comfy/sd.py | 12 +- comfy/supported_models.py | 32 ++- comfy/text_encoders/llama.py | 26 ++ comfy/text_encoders/qwen_image.py | 71 ++++++ nodes.py | 2 +- 8 files changed, 557 insertions(+), 4 deletions(-) create mode 100644 comfy/ldm/qwen_image/model.py create mode 100644 comfy/text_encoders/qwen_image.py diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py new file mode 100644 index 000000000..ff631a60f --- /dev/null +++ b/comfy/ldm/qwen_image/model.py @@ -0,0 +1,399 @@ +# https://github.com/QwenLM/Qwen-Image (Apache 2.0) +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Optional, Tuple +from einops import repeat + +from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps +from comfy.ldm.modules.attention import optimized_attention_masked +from comfy.ldm.flux.layers import EmbedND + + +class GELU(nn.Module): + def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True, dtype=None, device=None, operations=None): + super().__init__() + self.proj = operations.Linear(dim_in, dim_out, bias=bias, dtype=dtype, device=device) + self.approximate = approximate + + def forward(self, hidden_states): + hidden_states = self.proj(hidden_states) + hidden_states = F.gelu(hidden_states, approximate=self.approximate) + return hidden_states + + +class FeedForward(nn.Module): + def __init__( + self, + dim: int, + dim_out: Optional[int] = None, + mult: int = 4, + dropout: float = 0.0, + inner_dim=None, + bias: bool = True, + dtype=None, device=None, operations=None + ): + super().__init__() + if inner_dim is None: + inner_dim = int(dim * mult) + dim_out = dim_out if dim_out is not None else dim + + self.net = nn.ModuleList([]) + self.net.append(GELU(dim, inner_dim, approximate="tanh", bias=bias, dtype=dtype, device=device, operations=operations)) + self.net.append(nn.Dropout(dropout)) + self.net.append(operations.Linear(inner_dim, dim_out, bias=bias, dtype=dtype, device=device)) + + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: + for module in self.net: + hidden_states = module(hidden_states) + return hidden_states + + +def apply_rotary_emb(x, freqs_cis): + if x.shape[1] == 0: + return x + + t_ = x.reshape(*x.shape[:-1], -1, 1, 2) + t_out = freqs_cis[..., 0] * t_[..., 0] + freqs_cis[..., 1] * t_[..., 1] + return t_out.reshape(*x.shape) + + +class QwenTimestepProjEmbeddings(nn.Module): + def __init__(self, embedding_dim, pooled_projection_dim, dtype=None, device=None, operations=None): + super().__init__() + self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000) + self.timestep_embedder = TimestepEmbedding( + in_channels=256, + time_embed_dim=embedding_dim, + dtype=dtype, + device=device, + operations=operations + ) + + def forward(self, timestep, hidden_states): + timesteps_proj = self.time_proj(timestep) + timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype)) + return timesteps_emb + + +class Attention(nn.Module): + def __init__( + self, + query_dim: int, + dim_head: int = 64, + heads: int = 8, + dropout: float = 0.0, + bias: bool = False, + eps: float = 1e-5, + out_bias: bool = True, + out_dim: int = None, + out_context_dim: int = None, + dtype=None, + device=None, + operations=None + ): + super().__init__() + self.inner_dim = out_dim if out_dim is not None else dim_head * heads + self.inner_kv_dim = self.inner_dim + self.heads = heads + self.dim_head = dim_head + self.out_dim = out_dim if out_dim is not None else query_dim + self.out_context_dim = out_context_dim if out_context_dim is not None else query_dim + self.dropout = dropout + + # Q/K normalization + self.norm_q = operations.RMSNorm(dim_head, eps=eps, elementwise_affine=True, dtype=dtype, device=device) + self.norm_k = operations.RMSNorm(dim_head, eps=eps, elementwise_affine=True, dtype=dtype, device=device) + self.norm_added_q = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device) + self.norm_added_k = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device) + + # Image stream projections + self.to_q = operations.Linear(query_dim, self.inner_dim, bias=bias, dtype=dtype, device=device) + self.to_k = operations.Linear(query_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device) + self.to_v = operations.Linear(query_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device) + + # Text stream projections + self.add_q_proj = operations.Linear(query_dim, self.inner_dim, bias=bias, dtype=dtype, device=device) + self.add_k_proj = operations.Linear(query_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device) + self.add_v_proj = operations.Linear(query_dim, self.inner_kv_dim, bias=bias, dtype=dtype, device=device) + + # Output projections + self.to_out = nn.ModuleList([ + operations.Linear(self.inner_dim, self.out_dim, bias=out_bias, dtype=dtype, device=device), + nn.Dropout(dropout) + ]) + self.to_add_out = operations.Linear(self.inner_dim, self.out_context_dim, bias=out_bias, dtype=dtype, device=device) + + def forward( + self, + hidden_states: torch.FloatTensor, # Image stream + encoder_hidden_states: torch.FloatTensor = None, # Text stream + encoder_hidden_states_mask: torch.FloatTensor = None, + attention_mask: Optional[torch.FloatTensor] = None, + image_rotary_emb: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + seq_txt = encoder_hidden_states.shape[1] + + img_query = self.to_q(hidden_states).unflatten(-1, (self.heads, -1)) + img_key = self.to_k(hidden_states).unflatten(-1, (self.heads, -1)) + img_value = self.to_v(hidden_states).unflatten(-1, (self.heads, -1)) + + txt_query = self.add_q_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1)) + txt_key = self.add_k_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1)) + txt_value = self.add_v_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1)) + + img_query = self.norm_q(img_query) + img_key = self.norm_k(img_key) + txt_query = self.norm_added_q(txt_query) + txt_key = self.norm_added_k(txt_key) + + joint_query = torch.cat([txt_query, img_query], dim=1) + joint_key = torch.cat([txt_key, img_key], dim=1) + joint_value = torch.cat([txt_value, img_value], dim=1) + + joint_query = apply_rotary_emb(joint_query, image_rotary_emb) + joint_key = apply_rotary_emb(joint_key, image_rotary_emb) + + joint_query = joint_query.flatten(start_dim=2) + joint_key = joint_key.flatten(start_dim=2) + joint_value = joint_value.flatten(start_dim=2) + + joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, attention_mask) + + txt_attn_output = joint_hidden_states[:, :seq_txt, :] + img_attn_output = joint_hidden_states[:, seq_txt:, :] + + img_attn_output = self.to_out[0](img_attn_output) + img_attn_output = self.to_out[1](img_attn_output) + txt_attn_output = self.to_add_out(txt_attn_output) + + return img_attn_output, txt_attn_output + + +class QwenImageTransformerBlock(nn.Module): + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + eps: float = 1e-6, + dtype=None, + device=None, + operations=None + ): + super().__init__() + self.dim = dim + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + + self.img_mod = nn.Sequential( + nn.SiLU(), + operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device), + ) + self.img_norm1 = operations.LayerNorm(dim, elementwise_affine=False, eps=eps, dtype=dtype, device=device) + self.img_norm2 = operations.LayerNorm(dim, elementwise_affine=False, eps=eps, dtype=dtype, device=device) + self.img_mlp = FeedForward(dim=dim, dim_out=dim, dtype=dtype, device=device, operations=operations) + + self.txt_mod = nn.Sequential( + nn.SiLU(), + operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device), + ) + self.txt_norm1 = operations.LayerNorm(dim, elementwise_affine=False, eps=eps, dtype=dtype, device=device) + self.txt_norm2 = operations.LayerNorm(dim, elementwise_affine=False, eps=eps, dtype=dtype, device=device) + self.txt_mlp = FeedForward(dim=dim, dim_out=dim, dtype=dtype, device=device, operations=operations) + + self.attn = Attention( + query_dim=dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + bias=True, + eps=eps, + dtype=dtype, + device=device, + operations=operations, + ) + + def _modulate(self, x, mod_params): + shift, scale, gate = mod_params.chunk(3, dim=-1) + return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + encoder_hidden_states_mask: torch.Tensor, + temb: torch.Tensor, + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + img_mod_params = self.img_mod(temb) + txt_mod_params = self.txt_mod(temb) + img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1) + txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1) + + img_normed = self.img_norm1(hidden_states) + img_modulated, img_gate1 = self._modulate(img_normed, img_mod1) + txt_normed = self.txt_norm1(encoder_hidden_states) + txt_modulated, txt_gate1 = self._modulate(txt_normed, txt_mod1) + + img_attn_output, txt_attn_output = self.attn( + hidden_states=img_modulated, + encoder_hidden_states=txt_modulated, + encoder_hidden_states_mask=encoder_hidden_states_mask, + image_rotary_emb=image_rotary_emb, + ) + + hidden_states = hidden_states + img_gate1 * img_attn_output + encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output + + img_normed2 = self.img_norm2(hidden_states) + img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2) + hidden_states = hidden_states + img_gate2 * self.img_mlp(img_modulated2) + + txt_normed2 = self.txt_norm2(encoder_hidden_states) + txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2) + encoder_hidden_states = encoder_hidden_states + txt_gate2 * self.txt_mlp(txt_modulated2) + + return encoder_hidden_states, hidden_states + + +class LastLayer(nn.Module): + def __init__( + self, + embedding_dim: int, + conditioning_embedding_dim: int, + elementwise_affine=False, + eps=1e-6, + bias=True, + dtype=None, device=None, operations=None + ): + super().__init__() + self.silu = nn.SiLU() + self.linear = operations.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias, dtype=dtype, device=device) + self.norm = operations.LayerNorm(embedding_dim, eps, elementwise_affine=False, bias=bias, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: + emb = self.linear(self.silu(conditioning_embedding)) + scale, shift = torch.chunk(emb, 2, dim=1) + x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :] + return x + + +class QwenImageTransformer2DModel(nn.Module): + def __init__( + self, + patch_size: int = 2, + in_channels: int = 64, + out_channels: Optional[int] = 16, + num_layers: int = 60, + attention_head_dim: int = 128, + num_attention_heads: int = 24, + joint_attention_dim: int = 3584, + pooled_projection_dim: int = 768, + guidance_embeds: bool = False, + axes_dims_rope: Tuple[int, int, int] = (16, 56, 56), + image_model=None, + dtype=None, + device=None, + operations=None, + ): + super().__init__() + self.dtype = dtype + self.patch_size = patch_size + self.out_channels = out_channels or in_channels + self.inner_dim = num_attention_heads * attention_head_dim + + self.pe_embedder = EmbedND(dim=attention_head_dim, theta=10000, axes_dim=list(axes_dims_rope)) + + self.time_text_embed = QwenTimestepProjEmbeddings( + embedding_dim=self.inner_dim, + pooled_projection_dim=pooled_projection_dim, + dtype=dtype, + device=device, + operations=operations + ) + + self.txt_norm = operations.RMSNorm(joint_attention_dim, eps=1e-6, dtype=dtype, device=device) + self.img_in = operations.Linear(in_channels, self.inner_dim, dtype=dtype, device=device) + self.txt_in = operations.Linear(joint_attention_dim, self.inner_dim, dtype=dtype, device=device) + + self.transformer_blocks = nn.ModuleList([ + QwenImageTransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + dtype=dtype, + device=device, + operations=operations + ) + for _ in range(num_layers) + ]) + + self.norm_out = LastLayer(self.inner_dim, self.inner_dim, dtype=dtype, device=device, operations=operations) + self.proj_out = operations.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True, dtype=dtype, device=device) + self.gradient_checkpointing = False + + def pos_embeds(self, x, context): + bs, c, t, h, w = x.shape + patch_size = self.patch_size + h_len = ((h + (patch_size // 2)) // patch_size) + w_len = ((w + (patch_size // 2)) // patch_size) + + img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs) + + txt_start = round(max(h_len, w_len)) + txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(bs, 1, 3) + ids = torch.cat((txt_ids, img_ids), dim=1) + return self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + + def forward( + self, + x, + timesteps, + context, + attention_mask=None, + guidance: torch.Tensor = None, + **kwargs + ): + timestep = timesteps + encoder_hidden_states = context + encoder_hidden_states_mask = attention_mask + + image_rotary_emb = self.pos_embeds(x, context) + + orig_shape = x.shape + hidden_states = x.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) + hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) + hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) + + hidden_states = self.img_in(hidden_states) + encoder_hidden_states = self.txt_norm(encoder_hidden_states) + encoder_hidden_states = self.txt_in(encoder_hidden_states) + + if guidance is not None: + guidance = guidance * 1000 + + temb = ( + self.time_text_embed(timestep, hidden_states) + if guidance is None + else self.time_text_embed(timestep, guidance, hidden_states) + ) + + for block in self.transformer_blocks: + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + encoder_hidden_states_mask=encoder_hidden_states_mask, + temb=temb, + image_rotary_emb=image_rotary_emb, + ) + + hidden_states = self.norm_out(hidden_states, temb) + hidden_states = self.proj_out(hidden_states) + + hidden_states = hidden_states.view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2) + hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5) + return hidden_states.reshape(orig_shape) diff --git a/comfy/model_base.py b/comfy/model_base.py index a06686436..8a2d9cbe6 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -42,6 +42,7 @@ import comfy.ldm.hidream.model import comfy.ldm.chroma.model import comfy.ldm.ace.model import comfy.ldm.omnigen.omnigen2 +import comfy.ldm.qwen_image.model import comfy.model_management import comfy.patcher_extension @@ -1308,3 +1309,14 @@ class Omnigen2(BaseModel): if ref_latents is not None: out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16]) return out + +class QwenImage(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLUX, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.qwen_image.model.QwenImageTransformer2DModel) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + return out diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 9fc1f42de..8b57ebd2f 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -481,6 +481,11 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["timestep_scale"] = 1000.0 return dit_config + if '{}txt_norm.weight'.format(key_prefix) in state_dict_keys: # Qwen Image + dit_config = {} + dit_config["image_model"] = "qwen_image" + return dit_config + if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys: return None @@ -867,7 +872,7 @@ def convert_diffusers_mmdit(state_dict, output_prefix=""): depth_single_blocks = count_blocks(state_dict, 'single_transformer_blocks.{}.') hidden_size = state_dict["x_embedder.bias"].shape[0] sd_map = comfy.utils.flux_to_diffusers({"depth": depth, "depth_single_blocks": depth_single_blocks, "hidden_size": hidden_size}, output_prefix=output_prefix) - elif 'transformer_blocks.0.attn.add_q_proj.weight' in state_dict: #SD3 + elif 'transformer_blocks.0.attn.add_q_proj.weight' in state_dict and 'pos_embed.proj.weight' in state_dict: #SD3 num_blocks = count_blocks(state_dict, 'transformer_blocks.{}.') depth = state_dict["pos_embed.proj.weight"].shape[0] // 64 sd_map = comfy.utils.mmdit_to_diffusers({"depth": depth, "num_blocks": num_blocks}, output_prefix=output_prefix) diff --git a/comfy/sd.py b/comfy/sd.py index e0498e585..bb5d61fb3 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -47,6 +47,7 @@ import comfy.text_encoders.wan import comfy.text_encoders.hidream import comfy.text_encoders.ace import comfy.text_encoders.omnigen2 +import comfy.text_encoders.qwen_image import comfy.model_patcher import comfy.lora @@ -771,6 +772,7 @@ class CLIPType(Enum): CHROMA = 15 ACE = 16 OMNIGEN2 = 17 + QWEN_IMAGE = 18 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}): @@ -791,6 +793,7 @@ class TEModel(Enum): T5_XXL_OLD = 8 GEMMA_2_2B = 9 QWEN25_3B = 10 + QWEN25_7B = 11 def detect_te_model(sd): if "text_model.encoder.layers.30.mlp.fc1.weight" in sd: @@ -812,7 +815,11 @@ def detect_te_model(sd): if 'model.layers.0.post_feedforward_layernorm.weight' in sd: return TEModel.GEMMA_2_2B if 'model.layers.0.self_attn.k_proj.bias' in sd: - return TEModel.QWEN25_3B + weight = sd['model.layers.0.self_attn.k_proj.bias'] + if weight.shape[0] == 256: + return TEModel.QWEN25_3B + if weight.shape[0] == 512: + return TEModel.QWEN25_7B if "model.layers.0.post_attention_layernorm.weight" in sd: return TEModel.LLAMA3_8 return None @@ -917,6 +924,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip elif te_model == TEModel.QWEN25_3B: clip_target.clip = comfy.text_encoders.omnigen2.te(**llama_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.omnigen2.Omnigen2Tokenizer + elif te_model == TEModel.QWEN25_7B: + clip_target.clip = comfy.text_encoders.qwen_image.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.qwen_image.QwenImageTokenizer else: # clip_l if clip_type == CLIPType.SD3: diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 8f3f4652d..880055bd3 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -19,6 +19,7 @@ import comfy.text_encoders.lumina2 import comfy.text_encoders.wan import comfy.text_encoders.ace import comfy.text_encoders.omnigen2 +import comfy.text_encoders.qwen_image from . import supported_models_base from . import latent_formats @@ -1229,7 +1230,36 @@ class Omnigen2(supported_models_base.BASE): hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect)) +class QwenImage(supported_models_base.BASE): + unet_config = { + "image_model": "qwen_image", + } -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2] + sampling_settings = { + "multiplier": 1.0, + "shift": 2.6, + } + + memory_usage_factor = 1.8 #TODO + + unet_extra_config = {} + latent_format = latent_formats.Wan21 + + supported_inference_dtypes = [torch.bfloat16, torch.float32] + + vae_key_prefix = ["vae."] + text_encoder_key_prefix = ["text_encoders."] + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.QwenImage(self, device=device) + return out + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect)) + + +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 7fbd0f604..1da6a0c94 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -43,6 +43,23 @@ class Qwen25_3BConfig: mlp_activation = "silu" qkv_bias = True +@dataclass +class Qwen25_7BVLI_Config: + vocab_size: int = 152064 + hidden_size: int = 3584 + intermediate_size: int = 18944 + num_hidden_layers: int = 28 + num_attention_heads: int = 28 + num_key_value_heads: int = 4 + max_position_embeddings: int = 128000 + rms_norm_eps: float = 1e-6 + rope_theta: float = 1000000.0 + transformer_type: str = "llama" + head_dim = 128 + rms_norm_add = False + mlp_activation = "silu" + qkv_bias = True + @dataclass class Gemma2_2B_Config: vocab_size: int = 256000 @@ -348,6 +365,15 @@ class Qwen25_3B(BaseLlama, torch.nn.Module): self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) self.dtype = dtype +class Qwen25_7BVLI(BaseLlama, torch.nn.Module): + def __init__(self, config_dict, dtype, device, operations): + super().__init__() + config = Qwen25_7BVLI_Config(**config_dict) + self.num_layers = config.num_hidden_layers + + self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) + self.dtype = dtype + class Gemma2_2B(BaseLlama, torch.nn.Module): def __init__(self, config_dict, dtype, device, operations): super().__init__() diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py new file mode 100644 index 000000000..ce5c98097 --- /dev/null +++ b/comfy/text_encoders/qwen_image.py @@ -0,0 +1,71 @@ +from transformers import Qwen2Tokenizer +from comfy import sd1_clip +import comfy.text_encoders.llama +import os +import torch +import numbers + +class Qwen25_7BVLITokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer") + super().__init__(tokenizer_path, pad_with_end=False, embedding_size=3584, embedding_key='qwen25_7b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data) + + +class QwenImageTokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen25_7b", tokenizer=Qwen25_7BVLITokenizer) + self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + + def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None,**kwargs): + if llama_template is None: + llama_text = self.llama_template.format(text) + else: + llama_text = llama_template.format(text) + return super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs) + + +class Qwen25_7BVLIModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}): + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) + + +class QwenImageTEModel(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="qwen25_7b", clip_model=Qwen25_7BVLIModel, model_options=model_options) + + def encode_token_weights(self, token_weight_pairs): + out, pooled, extra = super().encode_token_weights(token_weight_pairs) + tok_pairs = token_weight_pairs["qwen25_7b"][0] + count_im_start = 0 + for i, v in enumerate(tok_pairs): + elem = v[0] + if not torch.is_tensor(elem): + if isinstance(elem, numbers.Integral): + if elem == 151644 and count_im_start < 2: + template_end = i + count_im_start += 1 + + if out.shape[1] > (template_end + 3): + if tok_pairs[template_end + 1][0] == 872: + if tok_pairs[template_end + 2][0] == 198: + template_end += 3 + + out = out[:, template_end:] + + extra["attention_mask"] = extra["attention_mask"][:, template_end:] + if extra["attention_mask"].sum() == torch.numel(extra["attention_mask"]): + extra.pop("attention_mask") # attention mask is useless if no masked elements + + return out, pooled, extra + + +def te(dtype_llama=None, llama_scaled_fp8=None): + class QwenImageTEModel_(QwenImageTEModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: + model_options = model_options.copy() + model_options["scaled_fp8"] = llama_scaled_fp8 + if dtype_llama is not None: + dtype = dtype_llama + super().__init__(device=device, dtype=dtype, model_options=model_options) + return QwenImageTEModel_ diff --git a/nodes.py b/nodes.py index da4a46366..9bedbcaca 100644 --- a/nodes.py +++ b/nodes.py @@ -925,7 +925,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), From f69609bbd6c20f4814e313f8974656b187a9bee2 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Mon, 4 Aug 2025 22:52:25 -0700 Subject: [PATCH 098/759] Add Veo3 video generation node with audio support (#9110) - Create new Veo3VideoGenerationNode that extends VeoVideoGenerationNode - Add support for generateAudio parameter (only for Veo3 models) - Support new Veo3 models: veo-3.0-generate-001, veo-3.0-fast-generate-001 - Fix Veo3 duration constraint to 8 seconds only - Update original node to be clearly Veo 2 only - Update API paths to use model parameter: /proxy/veo/{model}/generate - Regenerate API types from staging to include generateAudio parameter - Fix TripoModelVersion enum reference after regeneration - Mark generated API types file in .gitattributes --- .gitattributes | 1 + comfy_api_nodes/apis/__init__.py | 2656 ++++++++++++++++++++++++++++- comfy_api_nodes/apis/tripo_api.py | 2 +- comfy_api_nodes/nodes_veo2.py | 98 +- 4 files changed, 2664 insertions(+), 93 deletions(-) diff --git a/.gitattributes b/.gitattributes index 4391de678..5b3c15bb4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,3 @@ /web/assets/** linguist-generated /web/** linguist-vendored +comfy_api_nodes/apis/__init__.py linguist-generated diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 086028abe..54298e8a9 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: filtered-openapi.yaml -# timestamp: 2025-07-06T09:47:31+00:00 +# timestamp: 2025-07-30T08:54:00+00:00 from __future__ import annotations @@ -37,6 +37,99 @@ class AuditLog(BaseModel): ) +class BFLAsyncResponse(BaseModel): + id: str = Field(..., title='Id') + polling_url: str = Field(..., title='Polling Url') + + +class BFLAsyncWebhookResponse(BaseModel): + id: str = Field(..., title='Id') + status: str = Field(..., title='Status') + webhook_url: str = Field(..., title='Webhook Url') + + +class CannyHighThreshold(RootModel[int]): + root: int = Field( + ..., + description='High threshold for Canny edge detection', + ge=0, + le=500, + title='Canny High Threshold', + ) + + +class CannyLowThreshold(RootModel[int]): + root: int = Field( + ..., + description='Low threshold for Canny edge detection', + ge=0, + le=500, + title='Canny Low Threshold', + ) + + +class Guidance(RootModel[float]): + root: float = Field( + ..., + description='Guidance strength for the image generation process', + ge=1.0, + le=100.0, + title='Guidance', + ) + + +class Steps(RootModel[int]): + root: int = Field( + ..., + description='Number of steps for the image generation process', + ge=15, + le=50, + title='Steps', + ) + + +class WebhookUrl(RootModel[AnyUrl]): + root: AnyUrl = Field( + ..., description='URL to receive webhook notifications', title='Webhook Url' + ) + + +class BFLFluxKontextMaxGenerateRequest(BaseModel): + guidance: Optional[float] = Field( + 3, description='The guidance scale for generation', ge=1.0, le=20.0 + ) + input_image: str = Field(..., description='Base64 encoded image to be edited') + prompt: str = Field( + ..., description='The text prompt describing what to edit on the image' + ) + steps: Optional[int] = Field( + 50, description='Number of inference steps', ge=1, le=50 + ) + + +class BFLFluxKontextMaxGenerateResponse(BaseModel): + id: str = Field(..., description='Job ID for tracking') + polling_url: str = Field(..., description='URL to poll for results') + + +class BFLFluxKontextProGenerateRequest(BaseModel): + guidance: Optional[float] = Field( + 3, description='The guidance scale for generation', ge=1.0, le=20.0 + ) + input_image: str = Field(..., description='Base64 encoded image to be edited') + prompt: str = Field( + ..., description='The text prompt describing what to edit on the image' + ) + steps: Optional[int] = Field( + 50, description='Number of inference steps', ge=1, le=50 + ) + + +class BFLFluxKontextProGenerateResponse(BaseModel): + id: str = Field(..., description='Job ID for tracking') + polling_url: str = Field(..., description='URL to poll for results') + + class OutputFormat(str, Enum): jpeg = 'jpeg' png = 'png' @@ -68,6 +161,67 @@ class BFLFluxPro11GenerateResponse(BaseModel): polling_url: str = Field(..., description='URL to poll for results') +class Bottom(RootModel[int]): + root: int = Field( + ..., + description='Number of pixels to expand at the bottom of the image', + ge=0, + le=2048, + title='Bottom', + ) + + +class Guidance2(RootModel[float]): + root: float = Field( + ..., + description='Guidance strength for the image generation process', + ge=1.5, + le=100.0, + title='Guidance', + ) + + +class Left(RootModel[int]): + root: int = Field( + ..., + description='Number of pixels to expand on the left side of the image', + ge=0, + le=2048, + title='Left', + ) + + +class Right(RootModel[int]): + root: int = Field( + ..., + description='Number of pixels to expand on the right side of the image', + ge=0, + le=2048, + title='Right', + ) + + +class Steps2(RootModel[int]): + root: int = Field( + ..., + description='Number of steps for the image generation process', + examples=[50], + ge=15, + le=50, + title='Steps', + ) + + +class Top(RootModel[int]): + root: int = Field( + ..., + description='Number of pixels to expand at the top of the image', + ge=0, + le=2048, + title='Top', + ) + + class BFLFluxProGenerateRequest(BaseModel): guidance_scale: Optional[float] = Field( None, description='The guidance scale for generation.', ge=1.0, le=20.0 @@ -96,7 +250,71 @@ class BFLFluxProGenerateResponse(BaseModel): polling_url: str = Field(..., description='URL to poll for the generation result.') +class BFLOutputFormat(str, Enum): + jpeg = 'jpeg' + png = 'png' + + +class BFLValidationError(BaseModel): + loc: List[Union[str, int]] = Field(..., title='Location') + msg: str = Field(..., title='Message') + type: str = Field(..., title='Error Type') + + class Status(str, Enum): + success = 'success' + not_found = 'not_found' + error = 'error' + + +class ClaimMyNodeRequest(BaseModel): + GH_TOKEN: str = Field( + ..., description='GitHub token to verify if the user owns the repo of the node' + ) + + +class ComfyNode(BaseModel): + category: Optional[str] = Field( + None, + description='UI category where the node is listed, used for grouping nodes.', + ) + comfy_node_name: Optional[str] = Field( + None, description='Unique identifier for the node' + ) + deprecated: Optional[bool] = Field( + None, + description='Indicates if the node is deprecated. Deprecated nodes are hidden in the UI.', + ) + description: Optional[str] = Field( + None, description="Brief description of the node's functionality or purpose." + ) + experimental: Optional[bool] = Field( + None, + description='Indicates if the node is experimental, subject to changes or removal.', + ) + function: Optional[str] = Field( + None, description='Name of the entry-point function to execute the node.' + ) + input_types: Optional[str] = Field(None, description='Defines input parameters') + output_is_list: Optional[List[bool]] = Field( + None, description='Boolean values indicating if each output is a list.' + ) + return_names: Optional[str] = Field( + None, description='Names of the outputs for clarity in workflows.' + ) + return_types: Optional[str] = Field( + None, description='Specifies the types of outputs produced by the node.' + ) + + +class ComfyNodeCloudBuildInfo(BaseModel): + build_id: Optional[str] = None + location: Optional[str] = None + project_id: Optional[str] = None + project_number: Optional[str] = None + + +class Status1(str, Enum): in_progress = 'in_progress' completed = 'completed' incomplete = 'incomplete' @@ -113,7 +331,7 @@ class ComputerToolCall(BaseModel): description='An identifier used when responding to the tool call with output.\n', ) id: str = Field(..., description='The unique ID of the computer call.') - status: Status = Field( + status: Status1 = Field( ..., description='The status of the item. One of `in_progress`, `completed`, or\n`incomplete`. Populated when items are returned via API.\n', ) @@ -156,6 +374,7 @@ class Customer(BaseModel): None, description='The date and time the user was created' ) email: Optional[str] = Field(None, description='The email address for this user') + has_fund: Optional[bool] = Field(None, description='Whether the user has funds') id: str = Field(..., description='The firebase UID of the user') is_admin: Optional[bool] = Field(None, description='Whether the user is an admin') metronome_id: Optional[str] = Field(None, description='The Metronome customer ID') @@ -194,6 +413,16 @@ class Type2(str, Enum): message = 'message' +class Error(BaseModel): + details: Optional[List[str]] = Field( + None, + description='Optional detailed information about the error or hints for resolving it.', + ) + message: Optional[str] = Field( + None, description='A clear and concise description of the error.' + ) + + class ErrorResponse(BaseModel): error: str message: str @@ -221,7 +450,7 @@ class Result(BaseModel): ) -class Status1(str, Enum): +class Status2(str, Enum): in_progress = 'in_progress' searching = 'searching' completed = 'completed' @@ -241,7 +470,7 @@ class FileSearchToolCall(BaseModel): results: Optional[List[Result]] = Field( None, description='The results of the file search tool call.\n' ) - status: Status1 = Field( + status: Status2 = Field( ..., description='The status of the file search tool call. One of `in_progress`, \n`searching`, `incomplete` or `failed`,\n', ) @@ -266,7 +495,7 @@ class FunctionTool(BaseModel): type: Literal['FunctionTool'] = Field(..., description='The type of tool') -class Status2(str, Enum): +class Status3(str, Enum): in_progress = 'in_progress' completed = 'completed' incomplete = 'incomplete' @@ -288,7 +517,7 @@ class FunctionToolCall(BaseModel): None, description='The unique ID of the function tool call.\n' ) name: str = Field(..., description='The name of the function to run.\n') - status: Optional[Status2] = Field( + status: Optional[Status3] = Field( None, description='The status of the item. One of `in_progress`, `completed`, or\n`incomplete`. Populated when items are returned via API.\n', ) @@ -442,6 +671,95 @@ class GeminiVideoMetadata(BaseModel): startOffset: Optional[GeminiOffset] = None +class GitCommitSummary(BaseModel): + author: Optional[str] = Field(None, description='The author of the commit') + branch_name: Optional[str] = Field( + None, description='The branch where the commit was made' + ) + commit_hash: Optional[str] = Field(None, description='The hash of the commit') + commit_name: Optional[str] = Field(None, description='The name of the commit') + status_summary: Optional[Dict[str, str]] = Field( + None, description='A map of operating system to status pairs' + ) + timestamp: Optional[datetime] = Field( + None, description='The timestamp when the commit was made' + ) + + +class GithubEnterprise(BaseModel): + avatar_url: str = Field(..., description='URL to the enterprise avatar') + created_at: datetime = Field(..., description='When the enterprise was created') + description: Optional[str] = Field(None, description='The enterprise description') + html_url: str = Field(..., description='The HTML URL of the enterprise') + id: int = Field(..., description='The enterprise ID') + name: str = Field(..., description='The enterprise name') + node_id: str = Field(..., description='The enterprise node ID') + slug: str = Field(..., description='The enterprise slug') + updated_at: datetime = Field( + ..., description='When the enterprise was last updated' + ) + website_url: Optional[str] = Field(None, description='The enterprise website URL') + + +class RepositorySelection(str, Enum): + selected = 'selected' + all = 'all' + + +class GithubOrganization(BaseModel): + avatar_url: str = Field(..., description="URL to the organization's avatar") + description: Optional[str] = Field(None, description='The organization description') + events_url: str = Field(..., description="The API URL of the organization's events") + hooks_url: str = Field(..., description="The API URL of the organization's hooks") + id: int = Field(..., description='The organization ID') + issues_url: str = Field(..., description="The API URL of the organization's issues") + login: str = Field(..., description="The organization's login name") + members_url: str = Field( + ..., description="The API URL of the organization's members" + ) + node_id: str = Field(..., description='The organization node ID') + public_members_url: str = Field( + ..., description="The API URL of the organization's public members" + ) + repos_url: str = Field( + ..., description="The API URL of the organization's repositories" + ) + url: str = Field(..., description='The API URL of the organization') + + +class State(str, Enum): + uploaded = 'uploaded' + open = 'open' + + +class Action(str, Enum): + published = 'published' + unpublished = 'unpublished' + created = 'created' + edited = 'edited' + deleted = 'deleted' + prereleased = 'prereleased' + released = 'released' + + +class Type7(str, Enum): + Bot = 'Bot' + User = 'User' + Organization = 'Organization' + + +class GithubUser(BaseModel): + avatar_url: str = Field(..., description="URL to the user's avatar") + gravatar_id: Optional[str] = Field(None, description="The user's gravatar ID") + html_url: str = Field(..., description='The HTML URL of the user') + id: int = Field(..., description="The user's ID") + login: str = Field(..., description="The user's login name") + node_id: str = Field(..., description="The user's node ID") + site_admin: bool = Field(..., description='Whether the user is a site admin') + type: Type7 = Field(..., description='The type of user') + url: str = Field(..., description='The API URL of the user') + + class IdeogramColorPalette1(BaseModel): name: str = Field(..., description='Name of the preset color palette') @@ -689,7 +1007,7 @@ class Includable(str, Enum): computer_call_output_output_image_url = 'computer_call_output.output.image_url' -class Type7(str, Enum): +class Type8(str, Enum): input_file = 'input_file' @@ -703,7 +1021,7 @@ class InputFileContent(BaseModel): filename: Optional[str] = Field( None, description='The name of the file to be sent to the model.' ) - type: Type7 = Field( + type: Type8 = Field( ..., description='The type of the input item. Always `input_file`.' ) @@ -714,7 +1032,7 @@ class Detail(str, Enum): auto = 'auto' -class Type8(str, Enum): +class Type9(str, Enum): input_image = 'input_image' @@ -730,7 +1048,7 @@ class InputImageContent(BaseModel): None, description='The URL of the image to be sent to the model. A fully qualified URL or base64 encoded image in a data URL.', ) - type: Type8 = Field( + type: Type9 = Field( ..., description='The type of the input item. Always `input_image`.' ) @@ -741,17 +1059,17 @@ class Role3(str, Enum): developer = 'developer' -class Type9(str, Enum): +class Type10(str, Enum): message = 'message' -class Type10(str, Enum): +class Type11(str, Enum): input_text = 'input_text' class InputTextContent(BaseModel): text: str = Field(..., description='The text input to the model.') - type: Type10 = Field( + type: Type11 = Field( ..., description='The type of the input item. Always `input_text`.' ) @@ -923,7 +1241,7 @@ class ResourcePackType(str, Enum): constant_period = 'constant_period' -class Status4(str, Enum): +class Status5(str, Enum): toBeOnline = 'toBeOnline' online = 'online' expired = 'expired' @@ -949,7 +1267,7 @@ class ResourcePackSubscribeInfo(BaseModel): None, description='Resource package type (decreasing_total=decreasing total, constant_period=constant periodicity)', ) - status: Optional[Status4] = Field(None, description='Resource Package Status') + status: Optional[Status5] = Field(None, description='Resource Package Status') total_quantity: Optional[float] = Field(None, description='Total quantity') @@ -1113,7 +1431,7 @@ class LumaError(BaseModel): detail: Optional[str] = Field(None, description='The error message') -class Type11(str, Enum): +class Type12(str, Enum): generation = 'generation' @@ -1153,7 +1471,7 @@ class LumaImageRef(BaseModel): ) -class Type12(str, Enum): +class Type13(str, Enum): image = 'image' @@ -1223,6 +1541,36 @@ class LumaVideoModelOutputResolution( root: Union[LumaVideoModelOutputResolution1, str] +class MachineStats(BaseModel): + cpu_capacity: Optional[str] = Field(None, description='Total CPU on the machine.') + disk_capacity: Optional[str] = Field( + None, description='Total disk capacity on the machine.' + ) + gpu_type: Optional[str] = Field( + None, description='The GPU type. eg. NVIDIA Tesla K80' + ) + initial_cpu: Optional[str] = Field( + None, description='Initial CPU available before the job starts.' + ) + initial_disk: Optional[str] = Field( + None, description='Initial disk available before the job starts.' + ) + initial_ram: Optional[str] = Field( + None, description='Initial RAM available before the job starts.' + ) + machine_name: Optional[str] = Field(None, description='Name of the machine.') + memory_capacity: Optional[str] = Field( + None, description='Total memory on the machine.' + ) + os_version: Optional[str] = Field( + None, description='The operating system version. eg. Ubuntu Linux 20.04' + ) + pip_freeze: Optional[str] = Field(None, description='The pip freeze output') + vram_time_series: Optional[Dict[str, Any]] = Field( + None, description='Time series of VRAM usage.' + ) + + class MinimaxBaseResponse(BaseModel): status_code: int = Field( ..., @@ -1251,7 +1599,7 @@ class MinimaxFileRetrieveResponse(BaseModel): file: File -class Status5(str, Enum): +class Status6(str, Enum): Queueing = 'Queueing' Preparing = 'Preparing' Processing = 'Processing' @@ -1265,7 +1613,7 @@ class MinimaxTaskResultResponse(BaseModel): None, description='After the task status changes to Success, this field returns the file ID corresponding to the generated video.', ) - status: Status5 = Field( + status: Status6 = Field( ..., description="Task status: 'Queueing' (in queue), 'Preparing' (task is preparing), 'Processing' (generating), 'Success' (task completed successfully), or 'Fail' (task failed).", ) @@ -1326,6 +1674,22 @@ class MinimaxVideoGenerationResponse(BaseModel): ) +class Modality(str, Enum): + MODALITY_UNSPECIFIED = 'MODALITY_UNSPECIFIED' + TEXT = 'TEXT' + IMAGE = 'IMAGE' + VIDEO = 'VIDEO' + AUDIO = 'AUDIO' + DOCUMENT = 'DOCUMENT' + + +class ModalityTokenCount(BaseModel): + modality: Optional[Modality] = None + tokenCount: Optional[int] = Field( + None, description='Number of tokens for the given modality.' + ) + + class Truncation(str, Enum): disabled = 'disabled' auto = 'auto' @@ -1391,13 +1755,13 @@ class MoonvalleyTextToVideoInferenceParams(BaseModel): 0, description='Index of the conditioning frame' ) cooldown_steps: Optional[int] = Field( - None, description='Number of cooldown steps (calculated based on num_frames)' + 75, description='Number of cooldown steps (calculated based on num_frames)' ) fps: Optional[int] = Field( 24, description='Frames per second of the generated video' ) guidance_scale: Optional[float] = Field( - 12.5, description='Guidance scale for generation control' + 10, description='Guidance scale for generation control' ) height: Optional[int] = Field( 1080, description='Height of the generated video in pixels' @@ -1421,7 +1785,7 @@ class MoonvalleyTextToVideoInferenceParams(BaseModel): True, description='Whether to use timestep transformation' ) warmup_steps: Optional[int] = Field( - None, description='Number of warmup steps (calculated based on num_frames)' + 0, description='Number of warmup steps (calculated based on num_frames)' ) width: Optional[int] = Field( 1920, description='Width of the generated video in pixels' @@ -1463,10 +1827,10 @@ class MoonvalleyVideoToVideoInferenceParams(BaseModel): 0, description='Index of the conditioning frame' ) cooldown_steps: Optional[int] = Field( - None, description='Number of cooldown steps (calculated based on num_frames)' + 36, description='Number of cooldown steps (calculated based on num_frames)' ) guidance_scale: Optional[float] = Field( - 12.5, description='Guidance scale for generation control' + 15, description='Guidance scale for generation control' ) negative_prompt: Optional[str] = Field(None, description='Negative prompt text') seed: Optional[int] = Field( @@ -1486,7 +1850,7 @@ class MoonvalleyVideoToVideoInferenceParams(BaseModel): True, description='Whether to use timestep transformation' ) warmup_steps: Optional[int] = Field( - None, description='Number of warmup steps (calculated based on num_frames)' + 24, description='Number of warmup steps (calculated based on num_frames)' ) @@ -1507,6 +1871,34 @@ class MoonvalleyVideoToVideoRequest(BaseModel): ) +class NodeStatus(str, Enum): + NodeStatusActive = 'NodeStatusActive' + NodeStatusDeleted = 'NodeStatusDeleted' + NodeStatusBanned = 'NodeStatusBanned' + + +class NodeVersionIdentifier(BaseModel): + node_id: str = Field(..., description='The unique identifier of the node') + version: str = Field(..., description='The version of the node') + + +class NodeVersionStatus(str, Enum): + NodeVersionStatusActive = 'NodeVersionStatusActive' + NodeVersionStatusDeleted = 'NodeVersionStatusDeleted' + NodeVersionStatusBanned = 'NodeVersionStatusBanned' + NodeVersionStatusPending = 'NodeVersionStatusPending' + NodeVersionStatusFlagged = 'NodeVersionStatusFlagged' + + +class NodeVersionUpdateRequest(BaseModel): + changelog: Optional[str] = Field( + None, description='The changelog describing the version changes.' + ) + deprecated: Optional[bool] = Field( + None, description='Whether the version is deprecated.' + ) + + class Moderation(str, Enum): low = 'low' auto = 'auto' @@ -1723,38 +2115,57 @@ class Object(str, Enum): response = 'response' -class Status6(str, Enum): +class Status7(str, Enum): completed = 'completed' failed = 'failed' in_progress = 'in_progress' incomplete = 'incomplete' -class Type13(str, Enum): +class Type14(str, Enum): output_audio = 'output_audio' class OutputAudioContent(BaseModel): data: str = Field(..., description='Base64-encoded audio data') transcript: str = Field(..., description='Transcript of the audio') - type: Type13 = Field(..., description='The type of output content') + type: Type14 = Field(..., description='The type of output content') class Role4(str, Enum): assistant = 'assistant' -class Type14(str, Enum): +class Type15(str, Enum): message = 'message' -class Type15(str, Enum): +class Type16(str, Enum): output_text = 'output_text' class OutputTextContent(BaseModel): text: str = Field(..., description='The text content') - type: Type15 = Field(..., description='The type of output content') + type: Type16 = Field(..., description='The type of output content') + + +class PersonalAccessToken(BaseModel): + createdAt: Optional[datetime] = Field( + None, description='[Output Only]The date and time the token was created.' + ) + description: Optional[str] = Field( + None, + description="Optional. A more detailed description of the token's intended use.", + ) + id: Optional[UUID] = Field(None, description='Unique identifier for the GitCommit') + name: Optional[str] = Field( + None, + description='Required. The name of the token. Can be a simple description.', + ) + token: Optional[str] = Field( + None, + description='[Output Only]. The personal access token. Only returned during creation.', + ) class AspectRatio1(RootModel[float]): @@ -1961,7 +2372,7 @@ class PixverseVideoResponse(BaseModel): Resp: Optional[Resp1] = None -class Status7(int, Enum): +class Status8(int, Enum): integer_1 = 1 integer_5 = 5 integer_6 = 6 @@ -1980,7 +2391,7 @@ class Resp2(BaseModel): resolution_ratio: Optional[int] = None seed: Optional[int] = None size: Optional[int] = None - status: Optional[Status7] = Field( + status: Optional[Status8] = Field( None, description='Video generation status codes:\n* 1 - Generation successful\n* 5 - Generating\n* 6 - Deleted\n* 7 - Contents moderation failed\n* 8 - Generation failed\n', ) @@ -1994,6 +2405,17 @@ class PixverseVideoResultResponse(BaseModel): Resp: Optional[Resp2] = None +class PublisherStatus(str, Enum): + PublisherStatusActive = 'PublisherStatusActive' + PublisherStatusBanned = 'PublisherStatusBanned' + + +class PublisherUser(BaseModel): + email: Optional[str] = Field(None, description='The email address for this user.') + id: Optional[str] = Field(None, description='The unique id for this user.') + name: Optional[str] = Field(None, description='The name for this user.') + + class RgbItem(RootModel[int]): root: int = Field(..., ge=0, le=255) @@ -2020,13 +2442,13 @@ class ReasoningEffort(str, Enum): high = 'high' -class Status8(str, Enum): +class Status9(str, Enum): in_progress = 'in_progress' completed = 'completed' incomplete = 'incomplete' -class Type16(str, Enum): +class Type17(str, Enum): summary_text = 'summary_text' @@ -2035,12 +2457,12 @@ class SummaryItem(BaseModel): ..., description='A short summary of the reasoning used by the model when generating\nthe response.\n', ) - type: Type16 = Field( + type: Type17 = Field( ..., description='The type of the object. Always `summary_text`.\n' ) -class Type17(str, Enum): +class Type18(str, Enum): reasoning = 'reasoning' @@ -2048,16 +2470,31 @@ class ReasoningItem(BaseModel): id: str = Field( ..., description='The unique identifier of the reasoning content.\n' ) - status: Optional[Status8] = Field( + status: Optional[Status9] = Field( None, description='The status of the item. One of `in_progress`, `completed`, or\n`incomplete`. Populated when items are returned via API.\n', ) summary: List[SummaryItem] = Field(..., description='Reasoning text contents.\n') - type: Type17 = Field( + type: Type18 = Field( ..., description='The type of the object. Always `reasoning`.\n' ) +class RecraftImageColor(BaseModel): + rgb: Optional[List[int]] = None + std: Optional[List[float]] = None + weight: Optional[float] = None + + +class RecraftImageFeatures(BaseModel): + nsfw_score: Optional[float] = None + + +class RecraftImageFormat(str, Enum): + webp = 'webp' + png = 'png' + + class Controls(BaseModel): artistic_level: Optional[int] = Field( None, @@ -2111,12 +2548,143 @@ class RecraftImageGenerationResponse(BaseModel): data: List[Datum3] = Field(..., description='Array of generated image information') +class RecraftImageStyle(str, Enum): + digital_illustration = 'digital_illustration' + icon = 'icon' + realistic_image = 'realistic_image' + vector_illustration = 'vector_illustration' + + +class RecraftImageSubStyle(str, Enum): + field_2d_art_poster = '2d_art_poster' + field_3d = '3d' + field_80s = '80s' + glow = 'glow' + grain = 'grain' + hand_drawn = 'hand_drawn' + infantile_sketch = 'infantile_sketch' + kawaii = 'kawaii' + pixel_art = 'pixel_art' + psychedelic = 'psychedelic' + seamless = 'seamless' + voxel = 'voxel' + watercolor = 'watercolor' + broken_line = 'broken_line' + colored_outline = 'colored_outline' + colored_shapes = 'colored_shapes' + colored_shapes_gradient = 'colored_shapes_gradient' + doodle_fill = 'doodle_fill' + doodle_offset_fill = 'doodle_offset_fill' + offset_fill = 'offset_fill' + outline = 'outline' + outline_gradient = 'outline_gradient' + uneven_fill = 'uneven_fill' + field_70s = '70s' + cartoon = 'cartoon' + doodle_line_art = 'doodle_line_art' + engraving = 'engraving' + flat_2 = 'flat_2' + kawaii_1 = 'kawaii' + line_art = 'line_art' + linocut = 'linocut' + seamless_1 = 'seamless' + b_and_w = 'b_and_w' + enterprise = 'enterprise' + hard_flash = 'hard_flash' + hdr = 'hdr' + motion_blur = 'motion_blur' + natural_light = 'natural_light' + studio_portrait = 'studio_portrait' + line_circuit = 'line_circuit' + field_2d_art_poster_2 = '2d_art_poster_2' + engraving_color = 'engraving_color' + flat_air_art = 'flat_air_art' + hand_drawn_outline = 'hand_drawn_outline' + handmade_3d = 'handmade_3d' + stickers_drawings = 'stickers_drawings' + plastic = 'plastic' + pictogram = 'pictogram' + + +class RecraftResponseFormat(str, Enum): + url = 'url' + b64_json = 'b64_json' + + +class RecraftTextLayoutItem(BaseModel): + bbox: List[List[float]] + text: str + + +class RecraftTransformModel(str, Enum): + refm1 = 'refm1' + recraft20b = 'recraft20b' + recraftv2 = 'recraftv2' + recraftv3 = 'recraftv3' + flux1_1pro = 'flux1_1pro' + flux1dev = 'flux1dev' + imagen3 = 'imagen3' + hidream_i1_dev = 'hidream_i1_dev' + + +class RecraftUserControls(BaseModel): + artistic_level: Optional[int] = None + background_color: Optional[RecraftImageColor] = None + colors: Optional[List[RecraftImageColor]] = None + no_text: Optional[bool] = None + + +class Attention(str, Enum): + low = 'low' + medium = 'medium' + high = 'high' + + +class Project(str, Enum): + comfyui = 'comfyui' + comfyui_frontend = 'comfyui_frontend' + desktop = 'desktop' + + +class ReleaseNote(BaseModel): + attention: Attention = Field( + ..., description='The attention level for this release' + ) + content: str = Field( + ..., description='The content of the release note in markdown format' + ) + id: int = Field(..., description='Unique identifier for the release note') + project: Project = Field( + ..., description='The project this release note belongs to' + ) + published_at: datetime = Field( + ..., description='When the release note was published' + ) + version: str = Field(..., description='The version of the release') + + class RenderingSpeed(str, Enum): BALANCED = 'BALANCED' TURBO = 'TURBO' QUALITY = 'QUALITY' +class Type19(str, Enum): + response_completed = 'response.completed' + + +class Type20(str, Enum): + response_content_part_added = 'response.content_part.added' + + +class Type21(str, Enum): + response_content_part_done = 'response.content_part.done' + + +class Type22(str, Enum): + response_created = 'response.created' + + class ResponseErrorCode(str, Enum): server_error = 'server_error' rate_limit_exceeded = 'rate_limit_exceeded' @@ -2138,12 +2706,27 @@ class ResponseErrorCode(str, Enum): image_file_not_found = 'image_file_not_found' -class Type18(str, Enum): +class Type23(str, Enum): + error = 'error' + + +class ResponseErrorEvent(BaseModel): + code: str = Field(..., description='The error code.\n') + message: str = Field(..., description='The error message.\n') + param: str = Field(..., description='The error parameter.\n') + type: Type23 = Field(..., description='The type of the event. Always `error`.\n') + + +class Type24(str, Enum): + response_failed = 'response.failed' + + +class Type25(str, Enum): json_object = 'json_object' class ResponseFormatJsonObject(BaseModel): - type: Type18 = Field( + type: Type25 = Field( ..., description='The type of response format being defined. Always `json_object`.', ) @@ -2156,16 +2739,32 @@ class ResponseFormatJsonSchemaSchema(BaseModel): ) -class Type19(str, Enum): +class Type26(str, Enum): text = 'text' class ResponseFormatText(BaseModel): - type: Type19 = Field( + type: Type26 = Field( ..., description='The type of response format being defined. Always `text`.' ) +class Type27(str, Enum): + response_in_progress = 'response.in_progress' + + +class Type28(str, Enum): + response_incomplete = 'response.incomplete' + + +class Type29(str, Enum): + response_output_item_added = 'response.output_item.added' + + +class Type30(str, Enum): + response_output_item_done = 'response.output_item.done' + + class Truncation1(str, Enum): auto = 'auto' disabled = 'disabled' @@ -2200,10 +2799,6 @@ class Rodin3DCheckStatusRequest(BaseModel): ) -class Rodin3DCheckStatusResponse(BaseModel): - pass - - class Rodin3DDownloadRequest(BaseModel): task_uuid: str = Field(..., description='Task UUID') @@ -2235,6 +2830,13 @@ class RodinResourceItem(BaseModel): url: Optional[str] = Field(None, description='Download url') +class RodinStatusOptions(str, Enum): + Done = 'Done' + Failed = 'Failed' + Generating = 'Generating' + Waiting = 'Waiting' + + class RodinTierType(str, Enum): Regular = 'Regular' Sketch = 'Sketch' @@ -2325,6 +2927,7 @@ class RunwayTextToImageAspectRatioEnum(str, Enum): field_1808_768 = '1808:768' field_2112_912 = '2112:912' + class Model4(str, Enum): gen4_image = 'gen4_image' @@ -2350,6 +2953,38 @@ class RunwayTextToImageResponse(BaseModel): id: Optional[str] = Field(None, description='Task ID') +class Name(str, Enum): + content_moderation = 'content_moderation' + + +class StabilityContentModerationResponse(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new) you file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: Name = Field( + ..., + description='Our content moderation system has flagged some part of your request and subsequently denied it. You were not charged for this request. While this may at times be frustrating, it is necessary to maintain the integrity of our platform and ensure a safe experience for all users. If you would like to provide feedback, please use the [Support Form](https://kb.stability.ai/knowledge-base/kb-tickets/new).', + ) + + +class StabilityCreativity(RootModel[float]): + root: float = Field( + ..., + description='Controls the likelihood of creating additional details not heavily conditioned by the init image.', + ge=0.2, + le=0.5, + ) + + class StabilityError(BaseModel): errors: List[str] = Field( ..., @@ -2371,7 +3006,17 @@ class StabilityError(BaseModel): ) -class Status9(str, Enum): +class StabilityGenerationID(RootModel[str]): + root: str = Field( + ..., + description='The `id` of a generation, typically used for async generations, that can be used to check the status of the generation or retrieve the result.', + examples=['a6dc6c6e20acda010fe14d71f180658f2896ed9b4ec25aa99a6ff06c796987c4'], + max_length=64, + min_length=64, + ) + + +class Status10(str, Enum): in_progress = 'in-progress' @@ -2379,10 +3024,860 @@ class StabilityGetResultResponse202(BaseModel): id: Optional[str] = Field( None, description='The ID of the generation result.', examples=[1234567890] ) - status: Optional[Status9] = None + status: Optional[Status10] = None -class Type20(str, Enum): +class AspectRatio3(str, Enum): + field_21_9 = '21:9' + field_16_9 = '16:9' + field_3_2 = '3:2' + field_5_4 = '5:4' + field_1_1 = '1:1' + field_4_5 = '4:5' + field_2_3 = '2:3' + field_9_16 = '9:16' + field_9_21 = '9:21' + + +class Mode(str, Enum): + text_to_image = 'text-to-image' + image_to_image = 'image-to-image' + + +class Model5(str, Enum): + sd3_5_large = 'sd3.5-large' + sd3_5_large_turbo = 'sd3.5-large-turbo' + sd3_5_medium = 'sd3.5-medium' + + +class OutputFormat3(str, Enum): + png = 'png' + jpeg = 'jpeg' + + +class StylePreset(str, Enum): + enhance = 'enhance' + anime = 'anime' + photographic = 'photographic' + digital_art = 'digital-art' + comic_book = 'comic-book' + fantasy_art = 'fantasy-art' + line_art = 'line-art' + analog_film = 'analog-film' + neon_punk = 'neon-punk' + isometric = 'isometric' + low_poly = 'low-poly' + origami = 'origami' + modeling_compound = 'modeling-compound' + cinematic = 'cinematic' + field_3d_model = '3d-model' + pixel_art = 'pixel-art' + tile_texture = 'tile-texture' + + +class StabilityImageGenerationSD3Request(BaseModel): + aspect_ratio: Optional[AspectRatio3] = Field( + '1:1', + description='Controls the aspect ratio of the generated image. Defaults to 1:1.\n\n> **Important:** This parameter is only valid for **text-to-image** requests.', + ) + cfg_scale: Optional[float] = Field( + None, + description='How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt). The _Large_ and _Medium_ models use a default of `4`. The _Turbo_ model uses a default of `1`.', + ge=1.0, + le=10.0, + ) + image: Optional[StrictBytes] = Field( + None, + description='The image to use as the starting point for the generation.\n\nSupported formats:\n\n\n\n - jpeg\n - png\n - webp\n\nSupported dimensions:\n\n\n\n - Every side must be at least 64 pixels\n\n> **Important:** This parameter is only valid for **image-to-image** requests.', + ) + mode: Optional[Mode] = Field( + 'text-to-image', + description='Controls whether this is a text-to-image or image-to-image generation, which affects which parameters are required:\n- **text-to-image** requires only the `prompt` parameter\n- **image-to-image** requires the `prompt`, `image`, and `strength` parameters', + title='GenerationMode', + ) + model: Optional[Model5] = Field( + 'sd3.5-large', + description='The model to use for generation.\n\n- `sd3.5-large` requires 6.5 credits per generation\n- `sd3.5-large-turbo` requires 4 credits per generation\n- `sd3.5-medium` requires 3.5 credits per generation\n- As of the April 17, 2025, `sd3-large`, `sd3-large-turbo` and `sd3-medium`\n\n\n\n are re-routed to their `sd3.5-[model version]` equivalent, at the same price.', + ) + negative_prompt: Optional[str] = Field( + None, + description='Keywords of what you **do not** wish to see in the output image.\nThis is an advanced feature.', + max_length=10000, + ) + output_format: Optional[OutputFormat3] = Field( + 'png', description='Dictates the `content-type` of the generated image.' + ) + prompt: str = Field( + ..., + description='What you wish to see in the output image. A strong, descriptive prompt that clearly defines\nelements, colors, and subjects will lead to better results.', + max_length=10000, + min_length=1, + ) + seed: Optional[float] = Field( + 0, + description="A specific value that is used to guide the 'randomness' of the generation. (Omit this parameter or pass `0` to use a random seed.)", + ge=0.0, + le=4294967294.0, + ) + strength: Optional[float] = Field( + None, + description='Sometimes referred to as _denoising_, this parameter controls how much influence the\n`image` parameter has on the generated image. A value of 0 would yield an image that\nis identical to the input. A value of 1 would be as if you passed in no image at all.\n\n> **Important:** This parameter is only valid for **image-to-image** requests.', + ge=0.0, + le=1.0, + ) + style_preset: Optional[StylePreset] = Field( + None, description='Guides the image model towards a particular style.' + ) + + +class FinishReason(str, Enum): + SUCCESS = 'SUCCESS' + CONTENT_FILTERED = 'CONTENT_FILTERED' + + +class StabilityImageGenrationSD3Response200(BaseModel): + finish_reason: FinishReason = Field( + ..., + description='The reason the generation finished.\n\n- `SUCCESS` = successful generation.\n- `CONTENT_FILTERED` = successful generation, however the output violated our content moderation\npolicy and has been blurred as a result.', + examples=['SUCCESS'], + ) + image: str = Field( + ..., + description='The generated image, encoded to base64.', + examples=['AAAAIGZ0eXBpc29tAAACAGlzb21pc28yYXZjMW1...'], + ) + seed: Optional[float] = Field( + 0, + description='The seed used as random noise for this generation.', + examples=[343940597], + ge=0.0, + le=4294967294.0, + ) + + +class StabilityImageGenrationSD3Response400(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationSD3Response413(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationSD3Response422(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationSD3Response429(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationSD3Response500(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class OutputFormat4(str, Enum): + jpeg = 'jpeg' + png = 'png' + webp = 'webp' + + +class StabilityImageGenrationUpscaleConservativeRequest(BaseModel): + creativity: Optional[StabilityCreativity] = Field( + default_factory=lambda: StabilityCreativity.model_validate(0.35) + ) + image: StrictBytes = Field( + ..., + description='The image you wish to upscale.\n\nSupported Formats:\n- jpeg\n- png\n- webp\n\nValidation Rules:\n- Every side must be at least 64 pixels\n- Total pixel count must be between 4,096 and 9,437,184 pixels\n- The aspect ratio must be between 1:2.5 and 2.5:1', + examples=['./some/image.png'], + ) + negative_prompt: Optional[str] = Field( + None, + description='A blurb of text describing what you **do not** wish to see in the output image.\nThis is an advanced feature.', + max_length=10000, + ) + output_format: Optional[OutputFormat4] = Field( + 'png', description='Dictates the `content-type` of the generated image.' + ) + prompt: str = Field( + ..., + description="What you wish to see in the output image. A strong, descriptive prompt that clearly defines\nelements, colors, and subjects will lead to better results.\n\nTo control the weight of a given word use the format `(word:weight)`,\nwhere `word` is the word you'd like to control the weight of and `weight`\nis a value between 0 and 1. For example: `The sky was a crisp (blue:0.3) and (green:0.8)`\nwould convey a sky that was blue and green, but more green than blue.", + max_length=10000, + min_length=1, + ) + seed: Optional[float] = Field( + 0, + description="A specific value that is used to guide the 'randomness' of the generation. (Omit this parameter or pass `0` to use a random seed.)", + ge=0.0, + le=4294967294.0, + ) + + +class StabilityImageGenrationUpscaleConservativeResponse200(BaseModel): + finish_reason: FinishReason = Field( + ..., + description='The reason the generation finished.\n\n- `SUCCESS` = successful generation.\n- `CONTENT_FILTERED` = successful generation, however the output violated our content moderation\npolicy and has been blurred as a result.', + examples=['SUCCESS'], + ) + image: str = Field( + ..., + description='The generated image, encoded to base64.', + examples=['AAAAIGZ0eXBpc29tAAACAGlzb21pc28yYXZjMW1...'], + ) + seed: Optional[float] = Field( + 0, + description='The seed used as random noise for this generation.', + examples=[343940597], + ge=0.0, + le=4294967294.0, + ) + + +class StabilityImageGenrationUpscaleConservativeResponse400(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleConservativeResponse413(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleConservativeResponse422(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleConservativeResponse429(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleConservativeResponse500(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleCreativeRequest(BaseModel): + creativity: Optional[float] = Field( + 0.3, + description='Indicates how creative the model should be when upscaling an image.\nHigher values will result in more details being added to the image during upscaling.', + ge=0.1, + le=0.5, + ) + image: StrictBytes = Field( + ..., + description='The image you wish to upscale.\n\nSupported Formats:\n- jpeg\n- png\n- webp\n\nValidation Rules:\n- Every side must be at least 64 pixels\n- Total pixel count must be between 4,096 and 1,048,576 pixels', + examples=['./some/image.png'], + ) + negative_prompt: Optional[str] = Field( + None, + description='A blurb of text describing what you **do not** wish to see in the output image.\nThis is an advanced feature.', + max_length=10000, + ) + output_format: Optional[OutputFormat4] = Field( + 'png', description='Dictates the `content-type` of the generated image.' + ) + prompt: str = Field( + ..., + description="What you wish to see in the output image. A strong, descriptive prompt that clearly defines\nelements, colors, and subjects will lead to better results.\n\nTo control the weight of a given word use the format `(word:weight)`,\nwhere `word` is the word you'd like to control the weight of and `weight`\nis a value between 0 and 1. For example: `The sky was a crisp (blue:0.3) and (green:0.8)`\nwould convey a sky that was blue and green, but more green than blue.", + max_length=10000, + min_length=1, + ) + seed: Optional[float] = Field( + 0, + description="A specific value that is used to guide the 'randomness' of the generation. (Omit this parameter or pass `0` to use a random seed.)", + ge=0.0, + le=4294967294.0, + ) + style_preset: Optional[StylePreset] = Field( + None, description='Guides the image model towards a particular style.' + ) + + +class StabilityImageGenrationUpscaleCreativeResponse200(BaseModel): + id: StabilityGenerationID + + +class StabilityImageGenrationUpscaleCreativeResponse400(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleCreativeResponse413(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleCreativeResponse422(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleCreativeResponse429(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleCreativeResponse500(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleFastRequest(BaseModel): + image: StrictBytes = Field( + ..., + description='The image you wish to upscale.\n\nSupported Formats:\n- jpeg\n- png\n- webp\n\nValidation Rules:\n- Width must be between 32 and 1,536 pixels\n- Height must be between 32 and 1,536 pixels\n- Total pixel count must be between 1,024 and 1,048,576 pixels', + examples=['./some/image.png'], + ) + output_format: Optional[OutputFormat4] = Field( + 'png', description='Dictates the `content-type` of the generated image.' + ) + + +class StabilityImageGenrationUpscaleFastResponse200(BaseModel): + finish_reason: FinishReason = Field( + ..., + description='The reason the generation finished.\n\n- `SUCCESS` = successful generation.\n- `CONTENT_FILTERED` = successful generation, however the output violated our content moderation\npolicy and has been blurred as a result.', + examples=['SUCCESS'], + ) + image: str = Field( + ..., + description='The generated image, encoded to base64.', + examples=['AAAAIGZ0eXBpc29tAAACAGlzb21pc28yYXZjMW1...'], + ) + seed: Optional[float] = Field( + 0, + description='The seed used as random noise for this generation.', + examples=[343940597], + ge=0.0, + le=4294967294.0, + ) + + +class StabilityImageGenrationUpscaleFastResponse400(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleFastResponse413(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleFastResponse422(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleFastResponse429(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityImageGenrationUpscaleFastResponse500(BaseModel): + errors: List[str] = Field( + ..., + description='One or more error messages indicating what went wrong.', + examples=[['some-field: is required']], + min_length=1, + ) + id: str = Field( + ..., + description='A unique identifier associated with this error. Please include this in any [support tickets](https://kb.stability.ai/knowledge-base/kb-tickets/new)\nyou file, as it will greatly assist us in diagnosing the root cause of the problem.', + examples=['a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4'], + min_length=1, + ) + name: str = Field( + ..., + description='Short-hand name for an error, useful for discriminating between errors with the same status code.', + examples=['bad_request'], + min_length=1, + ) + + +class StabilityStabilityClientID(RootModel[str]): + root: str = Field( + ..., + description='The name of your application, used to help us communicate app-specific debugging or moderation issues to you.', + examples=['my-awesome-app'], + max_length=256, + ) + + +class StabilityStabilityClientUserID(RootModel[str]): + root: str = Field( + ..., + description='A unique identifier for your end user. Used to help us communicate user-specific debugging or moderation issues to you. Feel free to obfuscate this value to protect user privacy.', + examples=['DiscordUser#9999'], + max_length=256, + ) + + +class StabilityStabilityClientVersion(RootModel[str]): + root: str = Field( + ..., + description='The version of your application, used to help us communicate version-specific debugging or moderation issues to you.', + examples=['1.2.1'], + max_length=256, + ) + + +class StorageFile(BaseModel): + file_path: Optional[str] = Field(None, description='Path to the file in storage') + id: Optional[UUID] = Field( + None, description='Unique identifier for the storage file' + ) + public_url: Optional[str] = Field(None, description='Public URL') + + +class StripeAddress(BaseModel): + city: Optional[str] = None + country: Optional[str] = None + line1: Optional[str] = None + line2: Optional[str] = None + postal_code: Optional[str] = None + state: Optional[str] = None + + +class StripeAmountDetails(BaseModel): + tip: Optional[Dict[str, Any]] = None + + +class StripeBillingDetails(BaseModel): + address: Optional[StripeAddress] = None + email: Optional[str] = None + name: Optional[str] = None + phone: Optional[str] = None + tax_id: Optional[Any] = None + + +class Checks(BaseModel): + address_line1_check: Optional[Any] = None + address_postal_code_check: Optional[Any] = None + cvc_check: Optional[str] = None + + +class ExtendedAuthorization(BaseModel): + status: Optional[str] = None + + +class IncrementalAuthorization(BaseModel): + status: Optional[str] = None + + +class Multicapture(BaseModel): + status: Optional[str] = None + + +class NetworkToken(BaseModel): + used: Optional[bool] = None + + +class Overcapture(BaseModel): + maximum_amount_capturable: Optional[int] = None + status: Optional[str] = None + + +class StripeCardDetails(BaseModel): + amount_authorized: Optional[int] = None + authorization_code: Optional[Any] = None + brand: Optional[str] = None + checks: Optional[Checks] = None + country: Optional[str] = None + exp_month: Optional[int] = None + exp_year: Optional[int] = None + extended_authorization: Optional[ExtendedAuthorization] = None + fingerprint: Optional[str] = None + funding: Optional[str] = None + incremental_authorization: Optional[IncrementalAuthorization] = None + installments: Optional[Any] = None + last4: Optional[str] = None + mandate: Optional[Any] = None + multicapture: Optional[Multicapture] = None + network: Optional[str] = None + network_token: Optional[NetworkToken] = None + network_transaction_id: Optional[str] = None + overcapture: Optional[Overcapture] = None + regulated_status: Optional[str] = None + three_d_secure: Optional[Any] = None + wallet: Optional[Any] = None + + +class Object1(str, Enum): + charge = 'charge' + + +class Object2(str, Enum): + event = 'event' + + +class Type31(str, Enum): + payment_intent_succeeded = 'payment_intent.succeeded' + + +class StripeOutcome(BaseModel): + advice_code: Optional[Any] = None + network_advice_code: Optional[Any] = None + network_decline_code: Optional[Any] = None + network_status: Optional[str] = None + reason: Optional[Any] = None + risk_level: Optional[str] = None + risk_score: Optional[int] = None + seller_message: Optional[str] = None + type: Optional[str] = None + + +class Object3(str, Enum): + payment_intent = 'payment_intent' + + +class StripePaymentMethodDetails(BaseModel): + card: Optional[StripeCardDetails] = None + type: Optional[str] = None + + +class Card(BaseModel): + installments: Optional[Any] = None + mandate_options: Optional[Any] = None + network: Optional[Any] = None + request_three_d_secure: Optional[str] = None + + +class StripePaymentMethodOptions(BaseModel): + card: Optional[Card] = None + + +class StripeRefundList(BaseModel): + data: Optional[List[Dict[str, Any]]] = None + has_more: Optional[bool] = None + object: Optional[str] = None + total_count: Optional[int] = None + url: Optional[str] = None + + +class StripeRequestInfo(BaseModel): + id: Optional[str] = None + idempotency_key: Optional[str] = None + + +class StripeShipping(BaseModel): + address: Optional[StripeAddress] = None + carrier: Optional[str] = None + name: Optional[str] = None + phone: Optional[str] = None + tracking_number: Optional[str] = None + + +class Type32(str, Enum): json_schema = 'json_schema' @@ -2400,19 +3895,19 @@ class TextResponseFormatJsonSchema(BaseModel): False, description='Whether to enable strict schema adherence when generating the output.\nIf set to true, the model will always follow the exact schema defined\nin the `schema` field. Only a subset of JSON Schema is supported when\n`strict` is `true`. To learn more, read the [Structured Outputs\nguide](/docs/guides/structured-outputs).\n', ) - type: Type20 = Field( + type: Type32 = Field( ..., description='The type of response format being defined. Always `json_schema`.', ) -class Type21(str, Enum): +class Type33(str, Enum): function = 'function' class ToolChoiceFunction(BaseModel): name: str = Field(..., description='The name of the function to call.') - type: Type21 = Field( + type: Type33 = Field( ..., description='For function calling, the type is always `function`.' ) @@ -2423,7 +3918,7 @@ class ToolChoiceOptions(str, Enum): required = 'required' -class Type22(str, Enum): +class Type34(str, Enum): file_search = 'file_search' web_search_preview = 'web_search_preview' computer_use_preview = 'computer_use_preview' @@ -2431,7 +3926,7 @@ class Type22(str, Enum): class ToolChoiceTypes(BaseModel): - type: Type22 = Field( + type: Type34 = Field( ..., description='The type of hosted tool the model should to use. Learn more about\n[built-in tools](/docs/guides/tools).\n\nAllowed values are:\n- `file_search`\n- `web_search_preview`\n- `computer_use_preview`\n', ) @@ -2499,9 +3994,9 @@ class TripoModelStyle(str, Enum): class TripoModelVersion(str, Enum): - V2_5 = 'v2.5-20250123' - V2_0 = 'v2.0-20240919' - V1_4 = 'v1.4-20240625' + v2_5_20250123 = 'v2.5-20250123' + v2_0_20240919 = 'v2.0-20240919' + v1_4_20240625 = 'v1.4-20240625' class TripoMultiviewMode(str, Enum): @@ -2547,13 +4042,13 @@ class Code1(int, Enum): integer_0 = 0 -class Data8(BaseModel): +class Data9(BaseModel): task_id: str = Field(..., description='used for getTask') class TripoSuccessTask(BaseModel): code: Code1 - data: Data8 + data: Data9 class Topology(str, Enum): @@ -2570,7 +4065,7 @@ class Output(BaseModel): topology: Optional[Topology] = None -class Status10(str, Enum): +class Status11(str, Enum): queued = 'queued' running = 'running' success = 'success' @@ -2586,7 +4081,7 @@ class TripoTask(BaseModel): input: Dict[str, Any] output: Output progress: int = Field(..., ge=0, le=100) - status: Status10 + status: Status11 task_id: str type: str @@ -2650,6 +4145,18 @@ class TripoTypeTextureModel(str, Enum): texture_model = 'texture_model' +class User(BaseModel): + email: Optional[str] = Field(None, description='The email address for this user.') + id: Optional[str] = Field(None, description='The unique id for this user.') + isAdmin: Optional[bool] = Field( + None, description='Indicates if the user has admin privileges.' + ) + isApproved: Optional[bool] = Field( + None, description='Indicates if the user is approved.' + ) + name: Optional[str] = Field(None, description='The name for this user.') + + class Veo2GenVidPollRequest(BaseModel): operationName: str = Field( ..., @@ -2660,7 +4167,7 @@ class Veo2GenVidPollRequest(BaseModel): ) -class Error(BaseModel): +class Error1(BaseModel): code: Optional[int] = Field(None, description='Error code') message: Optional[str] = Field(None, description='Error message') @@ -2692,7 +4199,7 @@ class Response(BaseModel): class Veo2GenVidPollResponse(BaseModel): done: Optional[bool] = None - error: Optional[Error] = Field( + error: Optional[Error1] = Field( None, description='Error details if operation failed' ) name: Optional[str] = None @@ -2753,13 +4260,102 @@ class Veo2GenVidResponse(BaseModel): ) +class VeoGenVidPollRequest(BaseModel): + operationName: str = Field( + ..., + description='Full operation name (from predict response)', + examples=[ + 'projects/PROJECT_ID/locations/us-central1/publishers/google/models/MODEL_ID/operations/OPERATION_ID' + ], + ) + + +class Response1(BaseModel): + field_type: Optional[str] = Field( + None, + alias='@type', + examples=[ + 'type.googleapis.com/cloud.ai.large_models.vision.GenerateVideoResponse' + ], + ) + raiMediaFilteredCount: Optional[int] = Field( + None, description='Count of media filtered by responsible AI policies' + ) + raiMediaFilteredReasons: Optional[List[str]] = Field( + None, description='Reasons why media was filtered by responsible AI policies' + ) + videos: Optional[List[Video]] = None + + +class VeoGenVidPollResponse(BaseModel): + done: Optional[bool] = None + error: Optional[Error1] = Field( + None, description='Error details if operation failed' + ) + name: Optional[str] = None + response: Optional[Response1] = Field( + None, description='The actual prediction response if done is true' + ) + + +class Image2(BaseModel): + bytesBase64Encoded: str + gcsUri: Optional[str] = None + mimeType: Optional[str] = None + + +class Image3(BaseModel): + bytesBase64Encoded: Optional[str] = None + gcsUri: str + mimeType: Optional[str] = None + + +class Instance1(BaseModel): + image: Optional[Union[Image2, Image3]] = Field( + None, description='Optional image to guide video generation' + ) + prompt: str = Field(..., description='Text description of the video') + + +class Parameters1(BaseModel): + aspectRatio: Optional[str] = Field(None, examples=['16:9']) + durationSeconds: Optional[int] = None + enhancePrompt: Optional[bool] = None + generateAudio: Optional[bool] = Field( + None, + description='Generate audio for the video. Only supported by veo 3 models.', + ) + negativePrompt: Optional[str] = None + personGeneration: Optional[PersonGeneration1] = None + sampleCount: Optional[int] = None + seed: Optional[int] = None + storageUri: Optional[str] = Field( + None, description='Optional Cloud Storage URI to upload the video' + ) + + +class VeoGenVidRequest(BaseModel): + instances: Optional[List[Instance1]] = None + parameters: Optional[Parameters1] = None + + +class VeoGenVidResponse(BaseModel): + name: str = Field( + ..., + description='Operation resource name', + examples=[ + 'projects/PROJECT_ID/locations/us-central1/publishers/google/models/MODEL_ID/operations/a1b07c8e-7b5a-4aba-bb34-3e1ccb8afcc8' + ], + ) + + class SearchContextSize(str, Enum): low = 'low' medium = 'medium' high = 'high' -class Type23(str, Enum): +class Type35(str, Enum): web_search_preview = 'web_search_preview' web_search_preview_2025_03_11 = 'web_search_preview_2025_03_11' @@ -2775,30 +4371,348 @@ class WebSearchPreviewTool(BaseModel): ) -class Status11(str, Enum): +class Status12(str, Enum): in_progress = 'in_progress' searching = 'searching' completed = 'completed' failed = 'failed' -class Type24(str, Enum): +class Type36(str, Enum): web_search_call = 'web_search_call' class WebSearchToolCall(BaseModel): id: str = Field(..., description='The unique ID of the web search tool call.\n') - status: Status11 = Field( + status: Status12 = Field( ..., description='The status of the web search tool call.\n' ) - type: Type24 = Field( + type: Type36 = Field( ..., description='The type of the web search tool call. Always `web_search_call`.\n', ) -class CreateModelResponseProperties(ModelResponseProperties): - pass +class WorkflowRunStatus(str, Enum): + WorkflowRunStatusStarted = 'WorkflowRunStatusStarted' + WorkflowRunStatusFailed = 'WorkflowRunStatusFailed' + WorkflowRunStatusCompleted = 'WorkflowRunStatusCompleted' + + +class ActionJobResult(BaseModel): + action_job_id: Optional[str] = Field( + None, description='Identifier of the job this result belongs to' + ) + action_run_id: Optional[str] = Field( + None, description='Identifier of the run this result belongs to' + ) + author: Optional[str] = Field(None, description='The author of the commit') + avg_vram: Optional[int] = Field( + None, description='The average VRAM used by the job' + ) + branch_name: Optional[str] = Field( + None, description='Name of the relevant git branch' + ) + comfy_run_flags: Optional[str] = Field( + None, description='The comfy run flags. E.g. `--low-vram`' + ) + commit_hash: Optional[str] = Field(None, description='The hash of the commit') + commit_id: Optional[str] = Field(None, description='The ID of the commit') + commit_message: Optional[str] = Field(None, description='The message of the commit') + commit_time: Optional[int] = Field( + None, description='The Unix timestamp when the commit was made' + ) + cuda_version: Optional[str] = Field(None, description='CUDA version used') + end_time: Optional[int] = Field( + None, description='The end time of the job as a Unix timestamp.' + ) + git_repo: Optional[str] = Field(None, description='The repository name') + id: Optional[UUID] = Field(None, description='Unique identifier for the job result') + job_trigger_user: Optional[str] = Field( + None, description='The user who triggered the job.' + ) + machine_stats: Optional[MachineStats] = None + operating_system: Optional[str] = Field(None, description='Operating system used') + peak_vram: Optional[int] = Field(None, description='The peak VRAM used by the job') + pr_number: Optional[str] = Field(None, description='The pull request number') + python_version: Optional[str] = Field(None, description='PyTorch version used') + pytorch_version: Optional[str] = Field(None, description='PyTorch version used') + start_time: Optional[int] = Field( + None, description='The start time of the job as a Unix timestamp.' + ) + status: Optional[WorkflowRunStatus] = None + storage_file: Optional[StorageFile] = None + workflow_name: Optional[str] = Field(None, description='Name of the workflow') + + +class BFLCannyInputs(BaseModel): + canny_high_threshold: Optional[CannyHighThreshold] = Field( + default_factory=lambda: CannyHighThreshold.model_validate(200), + description='High threshold for Canny edge detection', + title='Canny High Threshold', + ) + canny_low_threshold: Optional[CannyLowThreshold] = Field( + default_factory=lambda: CannyLowThreshold.model_validate(50), + description='Low threshold for Canny edge detection', + title='Canny Low Threshold', + ) + control_image: Optional[str] = Field( + None, + description='Base64 encoded image to use as control input if no preprocessed image is provided', + title='Control Image', + ) + guidance: Optional[Guidance] = Field( + default_factory=lambda: Guidance.model_validate(30), + description='Guidance strength for the image generation process', + title='Guidance', + ) + output_format: Optional[BFLOutputFormat] = Field( + 'jpeg', + description="Output format for the generated image. Can be 'jpeg' or 'png'.", + ) + preprocessed_image: Optional[str] = Field( + None, + description='Optional pre-processed image that will bypass the control preprocessing step', + title='Preprocessed Image', + ) + prompt: str = Field( + ..., + description='Text prompt for image generation', + examples=['ein fantastisches bild'], + title='Prompt', + ) + prompt_upsampling: Optional[bool] = Field( + False, + description='Whether to perform upsampling on the prompt', + title='Prompt Upsampling', + ) + safety_tolerance: Optional[int] = Field( + 2, + description='Tolerance level for input and output moderation. Between 0 and 6, 0 being most strict, 6 being least strict.', + ge=0, + le=6, + title='Safety Tolerance', + ) + seed: Optional[int] = Field( + None, + description='Optional seed for reproducibility', + examples=[42], + title='Seed', + ) + steps: Optional[Steps] = Field( + default_factory=lambda: Steps.model_validate(50), + description='Number of steps for the image generation process', + title='Steps', + ) + webhook_secret: Optional[str] = Field( + None, + description='Optional secret for webhook signature verification', + title='Webhook Secret', + ) + webhook_url: Optional[WebhookUrl] = Field( + None, description='URL to receive webhook notifications', title='Webhook Url' + ) + + +class BFLDepthInputs(BaseModel): + control_image: Optional[str] = Field( + None, + description='Base64 encoded image to use as control input', + title='Control Image', + ) + guidance: Optional[Guidance] = Field( + default_factory=lambda: Guidance.model_validate(15), + description='Guidance strength for the image generation process', + title='Guidance', + ) + output_format: Optional[BFLOutputFormat] = Field( + 'jpeg', + description="Output format for the generated image. Can be 'jpeg' or 'png'.", + ) + preprocessed_image: Optional[str] = Field( + None, + description='Optional pre-processed image that will bypass the control preprocessing step', + title='Preprocessed Image', + ) + prompt: str = Field( + ..., + description='Text prompt for image generation', + examples=['ein fantastisches bild'], + title='Prompt', + ) + prompt_upsampling: Optional[bool] = Field( + False, + description='Whether to perform upsampling on the prompt', + title='Prompt Upsampling', + ) + safety_tolerance: Optional[int] = Field( + 2, + description='Tolerance level for input and output moderation. Between 0 and 6, 0 being most strict, 6 being least strict.', + ge=0, + le=6, + title='Safety Tolerance', + ) + seed: Optional[int] = Field( + None, + description='Optional seed for reproducibility', + examples=[42], + title='Seed', + ) + steps: Optional[Steps] = Field( + default_factory=lambda: Steps.model_validate(50), + description='Number of steps for the image generation process', + title='Steps', + ) + webhook_secret: Optional[str] = Field( + None, + description='Optional secret for webhook signature verification', + title='Webhook Secret', + ) + webhook_url: Optional[WebhookUrl] = Field( + None, description='URL to receive webhook notifications', title='Webhook Url' + ) + + +class BFLFluxProExpandInputs(BaseModel): + bottom: Optional[Bottom] = Field( + 0, + description='Number of pixels to expand at the bottom of the image', + title='Bottom', + ) + guidance: Optional[Guidance2] = Field( + default_factory=lambda: Guidance2.model_validate(60), + description='Guidance strength for the image generation process', + title='Guidance', + ) + image: str = Field( + ..., + description='A Base64-encoded string representing the image you wish to expand.', + title='Image', + ) + left: Optional[Left] = Field( + 0, + description='Number of pixels to expand on the left side of the image', + title='Left', + ) + output_format: Optional[BFLOutputFormat] = Field( + 'jpeg', + description="Output format for the generated image. Can be 'jpeg' or 'png'.", + ) + prompt: Optional[str] = Field( + '', + description='The description of the changes you want to make. This text guides the expansion process, allowing you to specify features, styles, or modifications for the expanded areas.', + examples=['ein fantastisches bild'], + title='Prompt', + ) + prompt_upsampling: Optional[bool] = Field( + False, + description='Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation', + title='Prompt Upsampling', + ) + right: Optional[Right] = Field( + 0, + description='Number of pixels to expand on the right side of the image', + title='Right', + ) + safety_tolerance: Optional[int] = Field( + 2, + description='Tolerance level for input and output moderation. Between 0 and 6, 0 being most strict, 6 being least strict.', + examples=[2], + ge=0, + le=6, + title='Safety Tolerance', + ) + seed: Optional[int] = Field( + None, description='Optional seed for reproducibility', title='Seed' + ) + steps: Optional[Steps2] = Field( + default_factory=lambda: Steps2.model_validate(50), + description='Number of steps for the image generation process', + examples=[50], + title='Steps', + ) + top: Optional[Top] = Field( + 0, description='Number of pixels to expand at the top of the image', title='Top' + ) + webhook_secret: Optional[str] = Field( + None, + description='Optional secret for webhook signature verification', + title='Webhook Secret', + ) + webhook_url: Optional[WebhookUrl] = Field( + None, description='URL to receive webhook notifications', title='Webhook Url' + ) + + +class BFLFluxProFillInputs(BaseModel): + guidance: Optional[Guidance2] = Field( + default_factory=lambda: Guidance2.model_validate(60), + description='Guidance strength for the image generation process', + title='Guidance', + ) + image: str = Field( + ..., + description='A Base64-encoded string representing the image you wish to modify. Can contain alpha mask if desired.', + title='Image', + ) + mask: Optional[str] = Field( + None, + description='A Base64-encoded string representing a mask for the areas you want to modify in the image. The mask should be the same dimensions as the image and in black and white. Black areas (0%) indicate no modification, while white areas (100%) specify areas for inpainting. Optional if you provide an alpha mask in the original image. Validation: The endpoint verifies that the dimensions of the mask match the original image.', + title='Mask', + ) + output_format: Optional[BFLOutputFormat] = Field( + 'jpeg', + description="Output format for the generated image. Can be 'jpeg' or 'png'.", + ) + prompt: Optional[str] = Field( + '', + description='The description of the changes you want to make. This text guides the inpainting process, allowing you to specify features, styles, or modifications for the masked area.', + examples=['ein fantastisches bild'], + title='Prompt', + ) + prompt_upsampling: Optional[bool] = Field( + False, + description='Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation', + title='Prompt Upsampling', + ) + safety_tolerance: Optional[int] = Field( + 2, + description='Tolerance level for input and output moderation. Between 0 and 6, 0 being most strict, 6 being least strict.', + examples=[2], + ge=0, + le=6, + title='Safety Tolerance', + ) + seed: Optional[int] = Field( + None, description='Optional seed for reproducibility', title='Seed' + ) + steps: Optional[Steps2] = Field( + default_factory=lambda: Steps2.model_validate(50), + description='Number of steps for the image generation process', + examples=[50], + title='Steps', + ) + webhook_secret: Optional[str] = Field( + None, + description='Optional secret for webhook signature verification', + title='Webhook Secret', + ) + webhook_url: Optional[WebhookUrl] = Field( + None, description='URL to receive webhook notifications', title='Webhook Url' + ) + + +class BFLHTTPValidationError(BaseModel): + detail: Optional[List[BFLValidationError]] = Field(None, title='Detail') + + +class BulkNodeVersionsRequest(BaseModel): + node_versions: List[NodeVersionIdentifier] = Field( + ..., description='List of node ID and version pairs to retrieve' + ) + + +CreateModelResponseProperties = ModelResponseProperties class GeminiInlineData(BaseModel): @@ -2841,6 +4755,125 @@ class GeminiSystemInstructionContent(BaseModel): ) +class GeminiUsageMetadata(BaseModel): + cachedContentTokenCount: Optional[int] = Field( + None, + description='Output only. Number of tokens in the cached part in the input (the cached content).', + ) + candidatesTokenCount: Optional[int] = Field( + None, description='Number of tokens in the response(s).' + ) + candidatesTokensDetails: Optional[List[ModalityTokenCount]] = Field( + None, description='Breakdown of candidate tokens by modality.' + ) + promptTokenCount: Optional[int] = Field( + None, + description='Number of tokens in the request. When cachedContent is set, this is still the total effective prompt size meaning this includes the number of tokens in the cached content.', + ) + promptTokensDetails: Optional[List[ModalityTokenCount]] = Field( + None, description='Breakdown of prompt tokens by modality.' + ) + thoughtsTokenCount: Optional[int] = Field( + None, description='Number of tokens present in thoughts output.' + ) + toolUsePromptTokenCount: Optional[int] = Field( + None, description='Number of tokens present in tool-use prompt(s).' + ) + + +class GithubInstallation(BaseModel): + access_tokens_url: str = Field(..., description='The API URL for access tokens') + account: GithubUser + app_id: int = Field(..., description='The GitHub App ID') + created_at: datetime = Field(..., description='When the installation was created') + events: List[str] = Field( + ..., description='The events the installation subscribes to' + ) + html_url: str = Field(..., description='The HTML URL of the installation') + id: int = Field(..., description='The installation ID') + permissions: Dict[str, Any] = Field(..., description='The installation permissions') + repositories_url: str = Field(..., description='The API URL for repositories') + repository_selection: RepositorySelection = Field( + ..., description='Repository selection for the installation' + ) + single_file_name: Optional[str] = Field( + None, description='The single file name if applicable' + ) + target_id: int = Field(..., description='The target ID') + target_type: str = Field(..., description='The target type') + updated_at: datetime = Field( + ..., description='When the installation was last updated' + ) + + +class GithubReleaseAsset(BaseModel): + browser_download_url: str = Field(..., description='The browser download URL') + content_type: str = Field(..., description='The content type of the asset') + created_at: datetime = Field(..., description='When the asset was created') + download_count: int = Field(..., description='The number of downloads') + id: int = Field(..., description='The asset ID') + label: Optional[str] = Field(None, description='The label of the asset') + name: str = Field(..., description='The name of the asset') + node_id: str = Field(..., description='The asset node ID') + size: int = Field(..., description='The size of the asset in bytes') + state: State = Field(..., description='The state of the asset') + updated_at: datetime = Field(..., description='When the asset was last updated') + uploader: GithubUser + + +class Release(BaseModel): + assets: List[GithubReleaseAsset] = Field(..., description='Array of release assets') + assets_url: Optional[str] = Field(None, description='The URL to the release assets') + author: GithubUser + body: Optional[str] = Field(None, description='The release notes/body') + created_at: datetime = Field(..., description='When the release was created') + draft: bool = Field(..., description='Whether the release is a draft') + html_url: str = Field(..., description='The HTML URL of the release') + id: int = Field(..., description='The ID of the release') + name: Optional[str] = Field(None, description='The name of the release') + node_id: str = Field(..., description='The node ID of the release') + prerelease: bool = Field(..., description='Whether the release is a prerelease') + published_at: Optional[datetime] = Field( + None, description='When the release was published' + ) + tag_name: str = Field(..., description='The tag name of the release') + tarball_url: str = Field(..., description='URL to the tarball') + target_commitish: str = Field( + ..., description='The branch or commit the release was created from' + ) + upload_url: Optional[str] = Field( + None, description='The URL to upload release assets' + ) + url: str = Field(..., description='The API URL of the release') + zipball_url: str = Field(..., description='URL to the zipball') + + +class GithubRepository(BaseModel): + clone_url: str = Field(..., description='The clone URL of the repository') + created_at: datetime = Field(..., description='When the repository was created') + default_branch: str = Field(..., description='The default branch of the repository') + description: Optional[str] = Field(None, description='The repository description') + fork: bool = Field(..., description='Whether the repository is a fork') + full_name: str = Field( + ..., description='The full name of the repository (owner/repo)' + ) + git_url: str = Field(..., description='The git URL of the repository') + html_url: str = Field(..., description='The HTML URL of the repository') + id: int = Field(..., description='The repository ID') + name: str = Field(..., description='The name of the repository') + node_id: str = Field(..., description='The repository node ID') + owner: GithubUser + private: bool = Field(..., description='Whether the repository is private') + pushed_at: datetime = Field( + ..., description='When the repository was last pushed to' + ) + ssh_url: str = Field(..., description='The SSH URL of the repository') + updated_at: datetime = Field( + ..., description='When the repository was last updated' + ) + url: str = Field(..., description='The API URL of the repository') + + class IdeogramV3EditRequest(BaseModel): color_palette: Optional[IdeogramColorPalette] = None image: Optional[StrictBytes] = Field( @@ -3276,6 +5309,52 @@ class MoonvalleyTextToImageRequest(BaseModel): webhook_url: Optional[str] = None +class NodeVersion(BaseModel): + changelog: Optional[str] = Field( + None, description='Summary of changes made in this version' + ) + comfy_node_extract_status: Optional[str] = Field( + None, description='The status of comfy node extraction process.' + ) + createdAt: Optional[datetime] = Field( + None, description='The date and time the version was created.' + ) + dependencies: Optional[List[str]] = Field( + None, description='A list of pip dependencies required by the node.' + ) + deprecated: Optional[bool] = Field( + None, description='Indicates if this version is deprecated.' + ) + downloadUrl: Optional[str] = Field( + None, description='[Output Only] URL to download this version of the node' + ) + id: Optional[str] = None + node_id: Optional[str] = Field( + None, description='The unique identifier of the node.' + ) + status: Optional[NodeVersionStatus] = None + status_reason: Optional[str] = Field( + None, description='The reason for the status change.' + ) + supported_accelerators: Optional[List[str]] = Field( + None, + description='List of accelerators (e.g. CUDA, DirectML, ROCm) that this node supports', + ) + supported_comfyui_frontend_version: Optional[str] = Field( + None, description='Supported versions of ComfyUI frontend' + ) + supported_comfyui_version: Optional[str] = Field( + None, description='Supported versions of ComfyUI' + ) + supported_os: Optional[List[str]] = Field( + None, description='List of operating systems that this node supports' + ) + version: Optional[str] = Field( + None, + description='The version identifier, following semantic versioning. Must be unique for the node.', + ) + + class OutputContent(RootModel[Union[OutputTextContent, OutputAudioContent]]): root: Union[OutputTextContent, OutputAudioContent] @@ -3283,7 +5362,7 @@ class OutputContent(RootModel[Union[OutputTextContent, OutputAudioContent]]): class OutputMessage(BaseModel): content: List[OutputContent] = Field(..., description='The content of the message') role: Role4 = Field(..., description='The role of the message') - type: Type14 = Field(..., description='The type of output item') + type: Type15 = Field(..., description='The type of output item') class PikaBodyGenerate22I2vGenerate22I2vPost(BaseModel): @@ -3333,6 +5412,16 @@ class PikaHTTPValidationError(BaseModel): detail: Optional[List[PikaValidationError]] = Field(None, title='Detail') +class PublisherMember(BaseModel): + id: Optional[str] = Field( + None, description='The unique identifier for the publisher member.' + ) + role: Optional[str] = Field( + None, description='The role of the user in the publisher.' + ) + user: Optional[PublisherUser] = None + + class Reasoning(BaseModel): effort: Optional[ReasoningEffort] = 'medium' generate_summary: Optional[GenerateSummary] = Field( @@ -3345,13 +5434,88 @@ class Reasoning(BaseModel): ) +class RecraftImage(BaseModel): + b64_json: Optional[str] = None + features: Optional[RecraftImageFeatures] = None + image_id: UUID + revised_prompt: Optional[str] = None + url: Optional[str] = None + + +class RecraftProcessImageRequest(BaseModel): + image: StrictBytes + image_format: Optional[RecraftImageFormat] = None + response_format: Optional[RecraftResponseFormat] = None + + +class RecraftProcessImageResponse(BaseModel): + created: int + credits: int + image: RecraftImage + + +class RecraftTextLayout(RootModel[List[RecraftTextLayoutItem]]): + root: List[RecraftTextLayoutItem] + + +class RecraftTransformImageWithMaskRequest(BaseModel): + block_nsfw: Optional[bool] = None + calculate_features: Optional[bool] = None + image: StrictBytes + image_format: Optional[RecraftImageFormat] = None + mask: StrictBytes + model: Optional[RecraftTransformModel] = None + n: Optional[int] = None + negative_prompt: Optional[str] = None + prompt: str + response_format: Optional[RecraftResponseFormat] = None + style: Optional[RecraftImageStyle] = None + style_id: Optional[UUID] = None + substyle: Optional[RecraftImageSubStyle] = None + text_layout: Optional[RecraftTextLayout] = None + + +class ResponseContentPartAddedEvent(BaseModel): + content_index: int = Field( + ..., description='The index of the content part that was added.' + ) + item_id: str = Field( + ..., description='The ID of the output item that the content part was added to.' + ) + output_index: int = Field( + ..., + description='The index of the output item that the content part was added to.', + ) + part: OutputContent + type: Type20 = Field( + ..., description='The type of the event. Always `response.content_part.added`.' + ) + + +class ResponseContentPartDoneEvent(BaseModel): + content_index: int = Field( + ..., description='The index of the content part that is done.' + ) + item_id: str = Field( + ..., description='The ID of the output item that the content part was added to.' + ) + output_index: int = Field( + ..., + description='The index of the output item that the content part was added to.', + ) + part: OutputContent + type: Type21 = Field( + ..., description='The type of the event. Always `response.content_part.done`.' + ) + + class ResponseError(BaseModel): code: ResponseErrorCode message: str = Field(..., description='A human-readable description of the error.') class Rodin3DDownloadResponse(BaseModel): - list: Optional[RodinResourceItem] = None + list: Optional[List[RodinResourceItem]] = None class Rodin3DGenerateRequest(BaseModel): @@ -3371,6 +5535,11 @@ class Rodin3DGenerateResponse(BaseModel): uuid: Optional[str] = Field(None, description='Task UUID') +class RodinCheckStatusJobItem(BaseModel): + status: Optional[RodinStatusOptions] = None + uuid: Optional[str] = Field(None, description='sub uuid') + + class RunwayImageToVideoRequest(BaseModel): duration: RunwayDurationEnum model: RunwayModelEnum @@ -3384,6 +5553,109 @@ class RunwayImageToVideoRequest(BaseModel): ) +class StripeCharge(BaseModel): + amount: Optional[int] = None + amount_captured: Optional[int] = None + amount_refunded: Optional[int] = None + application: Optional[str] = None + application_fee: Optional[str] = None + application_fee_amount: Optional[int] = None + balance_transaction: Optional[str] = None + billing_details: Optional[StripeBillingDetails] = None + calculated_statement_descriptor: Optional[str] = None + captured: Optional[bool] = None + created: Optional[int] = None + currency: Optional[str] = None + customer: Optional[str] = None + description: Optional[str] = None + destination: Optional[Any] = None + dispute: Optional[Any] = None + disputed: Optional[bool] = None + failure_balance_transaction: Optional[Any] = None + failure_code: Optional[Any] = None + failure_message: Optional[Any] = None + fraud_details: Optional[Dict[str, Any]] = None + id: Optional[str] = None + invoice: Optional[Any] = None + livemode: Optional[bool] = None + metadata: Optional[Dict[str, Any]] = None + object: Optional[Object1] = None + on_behalf_of: Optional[Any] = None + order: Optional[Any] = None + outcome: Optional[StripeOutcome] = None + paid: Optional[bool] = None + payment_intent: Optional[str] = None + payment_method: Optional[str] = None + payment_method_details: Optional[StripePaymentMethodDetails] = None + radar_options: Optional[Dict[str, Any]] = None + receipt_email: Optional[str] = None + receipt_number: Optional[str] = None + receipt_url: Optional[str] = None + refunded: Optional[bool] = None + refunds: Optional[StripeRefundList] = None + review: Optional[Any] = None + shipping: Optional[StripeShipping] = None + source: Optional[Any] = None + source_transfer: Optional[Any] = None + statement_descriptor: Optional[Any] = None + statement_descriptor_suffix: Optional[Any] = None + status: Optional[str] = None + transfer_data: Optional[Any] = None + transfer_group: Optional[Any] = None + + +class StripeChargeList(BaseModel): + data: Optional[List[StripeCharge]] = None + has_more: Optional[bool] = None + object: Optional[str] = None + total_count: Optional[int] = None + url: Optional[str] = None + + +class StripePaymentIntent(BaseModel): + amount: Optional[int] = None + amount_capturable: Optional[int] = None + amount_details: Optional[StripeAmountDetails] = None + amount_received: Optional[int] = None + application: Optional[str] = None + application_fee_amount: Optional[int] = None + automatic_payment_methods: Optional[Any] = None + canceled_at: Optional[int] = None + cancellation_reason: Optional[str] = None + capture_method: Optional[str] = None + charges: Optional[StripeChargeList] = None + client_secret: Optional[str] = None + confirmation_method: Optional[str] = None + created: Optional[int] = None + currency: Optional[str] = None + customer: Optional[str] = None + description: Optional[str] = None + id: Optional[str] = None + invoice: Optional[str] = None + last_payment_error: Optional[Any] = None + latest_charge: Optional[str] = None + livemode: Optional[bool] = None + metadata: Optional[Dict[str, Any]] = None + next_action: Optional[Any] = None + object: Optional[Object3] = None + on_behalf_of: Optional[Any] = None + payment_method: Optional[str] = None + payment_method_configuration_details: Optional[Any] = None + payment_method_options: Optional[StripePaymentMethodOptions] = None + payment_method_types: Optional[List[str]] = None + processing: Optional[Any] = None + receipt_email: Optional[str] = None + review: Optional[Any] = None + setup_future_usage: Optional[Any] = None + shipping: Optional[StripeShipping] = None + source: Optional[Any] = None + statement_descriptor: Optional[Any] = None + statement_descriptor_suffix: Optional[Any] = None + status: Optional[str] = None + transfer_data: Optional[Any] = None + transfer_group: Optional[Any] = None + + class TextResponseFormatConfiguration( RootModel[ Union[ @@ -3411,6 +5683,22 @@ class Tool( ] = Field(..., discriminator='type') +class BulkNodeVersionResult(BaseModel): + error_message: Optional[str] = Field( + None, + description='Error message if retrieval failed (only present if status is error)', + ) + identifier: NodeVersionIdentifier + node_version: Optional[NodeVersion] = None + status: Status = Field(..., description='Status of the retrieval operation') + + +class BulkNodeVersionsResponse(BaseModel): + node_versions: List[BulkNodeVersionResult] = Field( + ..., description='List of retrieved node versions with their status' + ) + + class EasyInputMessage(BaseModel): content: Union[str, InputMessageContentList] = Field( ..., @@ -3439,6 +5727,16 @@ class GeminiGenerateContentRequest(BaseModel): videoMetadata: Optional[GeminiVideoMetadata] = None +class GithubReleaseWebhook(BaseModel): + action: Action = Field(..., description='The action performed on the release') + enterprise: Optional[GithubEnterprise] = None + installation: Optional[GithubInstallation] = None + organization: Optional[GithubOrganization] = None + release: Release = Field(..., description='The release object') + repository: GithubRepository + sender: GithubUser + + class ImagenGenerateImageRequest(BaseModel): instances: List[ImagenImageGenerationInstance] parameters: ImagenImageGenerationParameters @@ -3447,8 +5745,8 @@ class ImagenGenerateImageRequest(BaseModel): class InputMessage(BaseModel): content: Optional[InputMessageContentList] = None role: Optional[Role3] = None - status: Optional[Status2] = None - type: Optional[Type9] = None + status: Optional[Status3] = None + type: Optional[Type10] = None class Item( @@ -3519,6 +5817,70 @@ class OutputItem( ] +class Publisher(BaseModel): + createdAt: Optional[datetime] = Field( + None, description='The date and time the publisher was created.' + ) + description: Optional[str] = None + id: Optional[str] = Field( + None, + description="The unique identifier for the publisher. It's akin to a username. Should be lowercase.", + ) + logo: Optional[str] = Field(None, description="URL to the publisher's logo.") + members: Optional[List[PublisherMember]] = Field( + None, description='A list of members in the publisher.' + ) + name: Optional[str] = None + source_code_repo: Optional[str] = None + status: Optional[PublisherStatus] = None + support: Optional[str] = None + website: Optional[str] = None + + +class RecraftGenerateImageResponse(BaseModel): + created: int + credits: int + data: List[RecraftImage] + + +class RecraftImageToImageRequest(BaseModel): + block_nsfw: Optional[bool] = None + calculate_features: Optional[bool] = None + controls: Optional[RecraftUserControls] = None + image: StrictBytes + image_format: Optional[RecraftImageFormat] = None + model: Optional[RecraftTransformModel] = None + n: Optional[int] = None + negative_prompt: Optional[str] = None + prompt: str + response_format: Optional[RecraftResponseFormat] = None + strength: float + style: Optional[RecraftImageStyle] = None + style_id: Optional[UUID] = None + substyle: Optional[RecraftImageSubStyle] = None + text_layout: Optional[RecraftTextLayout] = None + + +class ResponseOutputItemAddedEvent(BaseModel): + item: OutputItem + output_index: int = Field( + ..., description='The index of the output item that was added.\n' + ) + type: Type29 = Field( + ..., description='The type of the event. Always `response.output_item.added`.\n' + ) + + +class ResponseOutputItemDoneEvent(BaseModel): + item: OutputItem + output_index: int = Field( + ..., description='The index of the output item that was marked done.\n' + ) + type: Type30 = Field( + ..., description='The type of the event. Always `response.output_item.done`.\n' + ) + + class Text(BaseModel): format: Optional[TextResponseFormatConfiguration] = None @@ -3552,6 +5914,28 @@ class ResponseProperties(BaseModel): ) +class Rodin3DCheckStatusResponse(BaseModel): + jobs: Optional[List[RodinCheckStatusJobItem]] = Field( + None, description='Details for the generation status.' + ) + + +class Data8(BaseModel): + object: Optional[StripePaymentIntent] = None + + +class StripeEvent(BaseModel): + api_version: Optional[str] = None + created: Optional[int] = None + data: Data8 + id: str + livemode: Optional[bool] = None + object: Object2 + pending_webhooks: Optional[int] = None + request: Optional[StripeRequestInfo] = None + type: Type31 + + class GeminiCandidate(BaseModel): citationMetadata: Optional[GeminiCitationMetadata] = None content: Optional[GeminiContent] = None @@ -3562,12 +5946,67 @@ class GeminiCandidate(BaseModel): class GeminiGenerateContentResponse(BaseModel): candidates: Optional[List[GeminiCandidate]] = None promptFeedback: Optional[GeminiPromptFeedback] = None + usageMetadata: Optional[GeminiUsageMetadata] = None class InputItem(RootModel[Union[EasyInputMessage, Item]]): root: Union[EasyInputMessage, Item] +class Node(BaseModel): + author: Optional[str] = None + banner_url: Optional[str] = Field(None, description="URL to the node's banner.") + category: Optional[str] = Field(None, description='The category of the node.') + created_at: Optional[datetime] = Field( + None, description='The date and time when the node was created' + ) + description: Optional[str] = None + downloads: Optional[int] = Field( + None, description='The number of downloads of the node.' + ) + github_stars: Optional[int] = Field( + None, description='Number of stars on the GitHub repository.' + ) + icon: Optional[str] = Field(None, description="URL to the node's icon.") + id: Optional[str] = Field(None, description='The unique identifier of the node.') + latest_version: Optional[NodeVersion] = None + license: Optional[str] = Field( + None, description="The path to the LICENSE file in the node's repository." + ) + name: Optional[str] = Field(None, description='The display name of the node.') + preempted_comfy_node_names: Optional[List[str]] = Field( + None, description='A list of Comfy node names that are preempted by this node.' + ) + publisher: Optional[Publisher] = None + rating: Optional[float] = Field(None, description='The average rating of the node.') + repository: Optional[str] = Field(None, description="URL to the node's repository.") + search_ranking: Optional[int] = Field( + None, + description="A numerical value representing the node's search ranking, used for sorting search results.", + ) + status: Optional[NodeStatus] = None + status_detail: Optional[str] = Field( + None, description='The status detail of the node.' + ) + supported_accelerators: Optional[List[str]] = Field( + None, + description='List of accelerators (e.g. CUDA, DirectML, ROCm) that this node supports', + ) + supported_comfyui_frontend_version: Optional[str] = Field( + None, description='Supported versions of ComfyUI frontend' + ) + supported_comfyui_version: Optional[str] = Field( + None, description='Supported versions of ComfyUI' + ) + supported_os: Optional[List[str]] = Field( + None, description='List of operating systems that this node supports' + ) + tags: Optional[List[str]] = None + translations: Optional[Dict[str, Dict[str, Any]]] = Field( + None, description='Translations of node metadata in different languages.' + ) + + class OpenAICreateResponse(CreateModelResponseProperties, ResponseProperties): include: Optional[List[Includable]] = Field( None, @@ -3615,8 +6054,73 @@ class OpenAIResponse(ModelResponseProperties, ResponseProperties): parallel_tool_calls: Optional[bool] = Field( True, description='Whether to allow the model to run tool calls in parallel.\n' ) - status: Optional[Status6] = Field( + status: Optional[Status7] = Field( None, description='The status of the response generation. One of `completed`, `failed`, `in_progress`, or `incomplete`.', ) usage: Optional[ResponseUsage] = None + + +class ResponseCompletedEvent(BaseModel): + response: OpenAIResponse + type: Type19 = Field( + ..., description='The type of the event. Always `response.completed`.' + ) + + +class ResponseCreatedEvent(BaseModel): + response: OpenAIResponse + type: Type22 = Field( + ..., description='The type of the event. Always `response.created`.' + ) + + +class ResponseFailedEvent(BaseModel): + response: OpenAIResponse + type: Type24 = Field( + ..., description='The type of the event. Always `response.failed`.\n' + ) + + +class ResponseInProgressEvent(BaseModel): + response: OpenAIResponse + type: Type27 = Field( + ..., description='The type of the event. Always `response.in_progress`.\n' + ) + + +class ResponseIncompleteEvent(BaseModel): + response: OpenAIResponse + type: Type28 = Field( + ..., description='The type of the event. Always `response.incomplete`.\n' + ) + + +class OpenAIResponseStreamEvent( + RootModel[ + Union[ + ResponseCreatedEvent, + ResponseInProgressEvent, + ResponseCompletedEvent, + ResponseFailedEvent, + ResponseIncompleteEvent, + ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, + ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, + ResponseErrorEvent, + ] + ] +): + root: Union[ + ResponseCreatedEvent, + ResponseInProgressEvent, + ResponseCompletedEvent, + ResponseFailedEvent, + ResponseIncompleteEvent, + ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, + ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, + ResponseErrorEvent, + ] = Field(..., description='Events that can be emitted during response streaming') diff --git a/comfy_api_nodes/apis/tripo_api.py b/comfy_api_nodes/apis/tripo_api.py index 626e8d277..9f43d4d09 100644 --- a/comfy_api_nodes/apis/tripo_api.py +++ b/comfy_api_nodes/apis/tripo_api.py @@ -127,7 +127,7 @@ class TripoTextToModelRequest(BaseModel): type: TripoTaskType = Field(TripoTaskType.TEXT_TO_MODEL, description='Type of task') prompt: str = Field(..., description='The text prompt describing the model to generate', max_length=1024) negative_prompt: Optional[str] = Field(None, description='The negative text prompt', max_length=1024) - model_version: Optional[TripoModelVersion] = TripoModelVersion.V2_5 + model_version: Optional[TripoModelVersion] = TripoModelVersion.v2_5_20250123 face_limit: Optional[int] = Field(None, description='The number of faces to limit the generation to') texture: Optional[bool] = Field(True, description='Whether to apply texture to the generated model') pbr: Optional[bool] = Field(True, description='Whether to apply PBR to the generated model') diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index df846d5dd..97bfe20e6 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -8,10 +8,10 @@ from typing import Optional from comfy.comfy_types.node_typing import IO, ComfyNodeABC from comfy_api.input_impl.video_types import VideoFromFile from comfy_api_nodes.apis import ( - Veo2GenVidRequest, - Veo2GenVidResponse, - Veo2GenVidPollRequest, - Veo2GenVidPollResponse + VeoGenVidRequest, + VeoGenVidResponse, + VeoGenVidPollRequest, + VeoGenVidPollResponse ) from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -35,7 +35,7 @@ def convert_image_to_base64(image: torch.Tensor): return tensor_to_base64_string(scaled_image) -def get_video_url_from_response(poll_response: Veo2GenVidPollResponse) -> Optional[str]: +def get_video_url_from_response(poll_response: VeoGenVidPollResponse) -> Optional[str]: if ( poll_response.response and hasattr(poll_response.response, "videos") @@ -130,6 +130,14 @@ class VeoVideoGenerationNode(ComfyNodeABC): "default": None, "tooltip": "Optional reference image to guide video generation", }), + "model": ( + IO.COMBO, + { + "options": ["veo-2.0-generate-001"], + "default": "veo-2.0-generate-001", + "tooltip": "Veo 2 model to use for video generation", + }, + ), }, "hidden": { "auth_token": "AUTH_TOKEN_COMFY_ORG", @@ -141,7 +149,7 @@ class VeoVideoGenerationNode(ComfyNodeABC): RETURN_TYPES = (IO.VIDEO,) FUNCTION = "generate_video" CATEGORY = "api node/video/Veo" - DESCRIPTION = "Generates videos from text prompts using Google's Veo API" + DESCRIPTION = "Generates videos from text prompts using Google's Veo 2 API" API_NODE = True def generate_video( @@ -154,6 +162,8 @@ class VeoVideoGenerationNode(ComfyNodeABC): person_generation="ALLOW", seed=0, image=None, + model="veo-2.0-generate-001", + generate_audio=False, unique_id: Optional[str] = None, **kwargs, ): @@ -188,16 +198,19 @@ class VeoVideoGenerationNode(ComfyNodeABC): parameters["negativePrompt"] = negative_prompt if seed > 0: parameters["seed"] = seed + # Only add generateAudio for Veo 3 models + if "veo-3.0" in model: + parameters["generateAudio"] = generate_audio # Initial request to start video generation initial_operation = SynchronousOperation( endpoint=ApiEndpoint( - path="/proxy/veo/generate", + path=f"/proxy/veo/{model}/generate", method=HttpMethod.POST, - request_model=Veo2GenVidRequest, - response_model=Veo2GenVidResponse + request_model=VeoGenVidRequest, + response_model=VeoGenVidResponse ), - request=Veo2GenVidRequest( + request=VeoGenVidRequest( instances=instances, parameters=parameters ), @@ -223,16 +236,16 @@ class VeoVideoGenerationNode(ComfyNodeABC): # Define the polling operation poll_operation = PollingOperation( poll_endpoint=ApiEndpoint( - path="/proxy/veo/poll", + path=f"/proxy/veo/{model}/poll", method=HttpMethod.POST, - request_model=Veo2GenVidPollRequest, - response_model=Veo2GenVidPollResponse + request_model=VeoGenVidPollRequest, + response_model=VeoGenVidPollResponse ), completed_statuses=["completed"], failed_statuses=[], # No failed statuses, we'll handle errors after polling status_extractor=status_extractor, progress_extractor=progress_extractor, - request=Veo2GenVidPollRequest( + request=VeoGenVidPollRequest( operationName=operation_name ), auth_kwargs=kwargs, @@ -298,11 +311,64 @@ class VeoVideoGenerationNode(ComfyNodeABC): return (VideoFromFile(video_io),) -# Register the node +class Veo3VideoGenerationNode(VeoVideoGenerationNode): + """ + Generates videos from text prompts using Google's Veo 3 API. + + Supported models: + - veo-3.0-generate-001 + - veo-3.0-fast-generate-001 + + This node extends the base Veo node with Veo 3 specific features including + audio generation and fixed 8-second duration. + """ + + @classmethod + def INPUT_TYPES(s): + parent_input = super().INPUT_TYPES() + + # Update model options for Veo 3 + parent_input["optional"]["model"] = ( + IO.COMBO, + { + "options": ["veo-3.0-generate-001", "veo-3.0-fast-generate-001"], + "default": "veo-3.0-generate-001", + "tooltip": "Veo 3 model to use for video generation", + }, + ) + + # Add generateAudio parameter + parent_input["optional"]["generate_audio"] = ( + IO.BOOLEAN, + { + "default": False, + "tooltip": "Generate audio for the video. Supported by all Veo 3 models.", + } + ) + + # Update duration constraints for Veo 3 (only 8 seconds supported) + parent_input["optional"]["duration_seconds"] = ( + IO.INT, + { + "default": 8, + "min": 8, + "max": 8, + "step": 1, + "display": "number", + "tooltip": "Duration of the output video in seconds (Veo 3 only supports 8 seconds)", + }, + ) + + return parent_input + + +# Register the nodes NODE_CLASS_MAPPINGS = { "VeoVideoGenerationNode": VeoVideoGenerationNode, + "Veo3VideoGenerationNode": Veo3VideoGenerationNode, } NODE_DISPLAY_NAME_MAPPINGS = { - "VeoVideoGenerationNode": "Google Veo2 Video Generation", + "VeoVideoGenerationNode": "Google Veo 2 Video Generation", + "Veo3VideoGenerationNode": "Google Veo 3 Video Generation", } From 5be6fd09ffb46cfdff240fb5b96dd8c06b2a0344 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 5 Aug 2025 15:48:56 +0800 Subject: [PATCH 099/759] Update template to 0.1.48 (#9182) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ffa7dce65..470060ab4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.47 +comfyui-workflow-templates==0.1.48 comfyui-embedded-docs==0.2.4 torch torchsde From d044a243986700aae552acdebf7e767ae8282e37 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 5 Aug 2025 03:12:27 -0700 Subject: [PATCH 100/759] Fix default shift and any latent size for qwen image model. (#9186) --- comfy/ldm/qwen_image/model.py | 9 +++++---- comfy/supported_models.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index ff631a60f..c15ab8e40 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -8,7 +8,7 @@ from einops import repeat from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps from comfy.ldm.modules.attention import optimized_attention_masked from comfy.ldm.flux.layers import EmbedND - +import comfy.ldm.common_dit class GELU(nn.Module): def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True, dtype=None, device=None, operations=None): @@ -364,8 +364,9 @@ class QwenImageTransformer2DModel(nn.Module): image_rotary_emb = self.pos_embeds(x, context) - orig_shape = x.shape - hidden_states = x.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) + hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (1, self.patch_size, self.patch_size)) + orig_shape = hidden_states.shape + hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) @@ -396,4 +397,4 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states = hidden_states.view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2) hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5) - return hidden_states.reshape(orig_shape) + return hidden_states.reshape(orig_shape)[:, :, :, :x.shape[-2], :x.shape[-1]] diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 880055bd3..156ff9e26 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1237,7 +1237,7 @@ class QwenImage(supported_models_base.BASE): sampling_settings = { "multiplier": 1.0, - "shift": 2.6, + "shift": 1.15, } memory_usage_factor = 1.8 #TODO From da1ad9b5163fb848f3ec87ccc4fd0c8069f6eff0 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 5 Aug 2025 19:24:12 +0800 Subject: [PATCH 101/759] Update template to 0.1.51 (#9187) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 470060ab4..9a6b04cf2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.48 +comfyui-workflow-templates==0.1.51 comfyui-embedded-docs==0.2.4 torch torchsde From 32a95bba8ac91e8a610c35ce4d9963d2453118c1 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 5 Aug 2025 07:33:02 -0400 Subject: [PATCH 102/759] ComfyUI version 0.3.49 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 7b29e338d..5e2d09c81 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.48" +__version__ = "0.3.49" diff --git a/pyproject.toml b/pyproject.toml index 256677fad..3c530ba85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.48" +version = "0.3.49" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From d8c51ba15aef6b0df86a7ea0203881be55d7579b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 5 Aug 2025 04:41:18 -0700 Subject: [PATCH 103/759] Add Qwen Image model to readme. (#9191) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2abd8e600..119098f5c 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/) - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/) - [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/) + - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/) - Image Editing Models - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model) From 9126c0cfe49508a64c429f97b45664b241aab3f2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 6 Aug 2025 01:07:04 -0700 Subject: [PATCH 104/759] Qwen Image model merging node. (#9202) --- .../nodes_model_merging_model_specific.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/comfy_extras/nodes_model_merging_model_specific.py b/comfy_extras/nodes_model_merging_model_specific.py index 2c93cd84f..55eb3ccfe 100644 --- a/comfy_extras/nodes_model_merging_model_specific.py +++ b/comfy_extras/nodes_model_merging_model_specific.py @@ -314,6 +314,29 @@ class ModelMergeCosmosPredict2_14B(comfy_extras.nodes_model_merging.ModelMergeBl return {"required": arg_dict} +class ModelMergeQwenImage(comfy_extras.nodes_model_merging.ModelMergeBlocks): + CATEGORY = "advanced/model_merging/model_specific" + + @classmethod + def INPUT_TYPES(s): + arg_dict = { "model1": ("MODEL",), + "model2": ("MODEL",)} + + argument = ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}) + + arg_dict["pos_embeds."] = argument + arg_dict["img_in."] = argument + arg_dict["txt_norm."] = argument + arg_dict["txt_in."] = argument + arg_dict["time_text_embed."] = argument + + for i in range(60): + arg_dict["transformer_blocks.{}.".format(i)] = argument + + arg_dict["proj_out."] = argument + + return {"required": arg_dict} + NODE_CLASS_MAPPINGS = { "ModelMergeSD1": ModelMergeSD1, "ModelMergeSD2": ModelMergeSD1, #SD1 and SD2 have the same blocks @@ -329,4 +352,5 @@ NODE_CLASS_MAPPINGS = { "ModelMergeWAN2_1": ModelMergeWAN2_1, "ModelMergeCosmosPredict2_2B": ModelMergeCosmosPredict2_2B, "ModelMergeCosmosPredict2_14B": ModelMergeCosmosPredict2_14B, + "ModelMergeQwenImage": ModelMergeQwenImage, } From 4c3e57b0ae9fb7ff1322977915efe7e98544d15d Mon Sep 17 00:00:00 2001 From: flybirdxx <1119577418@qq.com> Date: Thu, 7 Aug 2025 01:23:11 +0800 Subject: [PATCH 105/759] Fixed an issue where qwenLora could not be loaded properly. (#9208) --- comfy/lora.py | 9 +++++++++ comfy/weight_adapter/lora.py | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/comfy/lora.py b/comfy/lora.py index 387d5c52a..6686b7229 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -293,6 +293,15 @@ def model_lora_keys_unet(model, key_map={}): key_lora = k[len("diffusion_model."):-len(".weight")] key_map["{}".format(key_lora)] = k + if isinstance(model, comfy.model_base.QwenImage): + for k in sdk: + if k.startswith("diffusion_model.") and k.endswith(".weight"): #QwenImage lora format + key_lora = k[len("diffusion_model."):-len(".weight")] + # Direct mapping for transformer_blocks format (QwenImage LoRA format) + key_map["{}".format(key_lora)] = k + # Support transformer prefix format + key_map["transformer.{}".format(key_lora)] = k + return key_map diff --git a/comfy/weight_adapter/lora.py b/comfy/weight_adapter/lora.py index 729dbd9e6..47aa17d13 100644 --- a/comfy/weight_adapter/lora.py +++ b/comfy/weight_adapter/lora.py @@ -96,6 +96,7 @@ class LoRAAdapter(WeightAdapterBase): diffusers3_lora = "{}.lora.up.weight".format(x) mochi_lora = "{}.lora_B".format(x) transformers_lora = "{}.lora_linear_layer.up.weight".format(x) + qwen_default_lora = "{}.lora_B.default.weight".format(x) A_name = None if regular_lora in lora.keys(): @@ -122,6 +123,10 @@ class LoRAAdapter(WeightAdapterBase): A_name = transformers_lora B_name = "{}.lora_linear_layer.down.weight".format(x) mid_name = None + elif qwen_default_lora in lora.keys(): + A_name = qwen_default_lora + B_name = "{}.lora_A.default.weight".format(x) + mid_name = None if A_name is not None: mid = None From 32691b16f4e1a897e461e77f9d6dceba2d6f0cd1 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Thu, 7 Aug 2025 01:26:29 +0800 Subject: [PATCH 106/759] Update template to 0.1.52 (#9206) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9a6b04cf2..d6926d610 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.23.4 -comfyui-workflow-templates==0.1.51 +comfyui-workflow-templates==0.1.52 comfyui-embedded-docs==0.2.4 torch torchsde From 37d620a6b85f61b824363ed8170db373726ca45a Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Wed, 6 Aug 2025 16:52:39 -0700 Subject: [PATCH 107/759] Update frontend to v1.24.3 (#9175) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d6926d610..2f4692b03 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.23.4 +comfyui-frontend-package==1.24.4 comfyui-workflow-templates==0.1.52 comfyui-embedded-docs==0.2.4 torch From 05df2df489f6b237f63c5f7d42a943ae2be417e9 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 7 Aug 2025 08:20:40 -0700 Subject: [PATCH 108/759] Fix RepeatLatentBatch not working on multi dim latents. (#9227) --- nodes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nodes.py b/nodes.py index 9bedbcaca..9448f9c1b 100644 --- a/nodes.py +++ b/nodes.py @@ -1229,12 +1229,12 @@ class RepeatLatentBatch: s = samples.copy() s_in = samples["samples"] - s["samples"] = s_in.repeat((amount, 1,1,1)) + s["samples"] = s_in.repeat((amount,) + ((1,) * (s_in.ndim - 1))) if "noise_mask" in samples and samples["noise_mask"].shape[0] > 1: masks = samples["noise_mask"] if masks.shape[0] < s_in.shape[0]: - masks = masks.repeat(math.ceil(s_in.shape[0] / masks.shape[0]), 1, 1, 1)[:s_in.shape[0]] - s["noise_mask"] = samples["noise_mask"].repeat((amount, 1,1,1)) + masks = masks.repeat((math.ceil(s_in.shape[0] / masks.shape[0]),) + ((1,) * (masks.ndim - 1)))[:s_in.shape[0]] + s["noise_mask"] = samples["noise_mask"].repeat((amount,) + ((1,) * (samples["noise_mask"].ndim - 1))) if "batch_index" in s: offset = max(s["batch_index"]) - min(s["batch_index"]) + 1 s["batch_index"] = s["batch_index"] + [x + (i * offset) for i in range(1, amount) for x in s["batch_index"]] From 42974a448c39af50c5f72d8c70267f9fe2971cd2 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Thu, 7 Aug 2025 14:54:09 -0700 Subject: [PATCH 109/759] _ui.py import torchaudio safety check (#9234) * Added safety around torchaudio import in _ui.py * Trusted cursor too much, fixed torchaudio bool --- comfy_api/latest/_ui.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py index 6b8a39d58..61597038f 100644 --- a/comfy_api/latest/_ui.py +++ b/comfy_api/latest/_ui.py @@ -9,7 +9,11 @@ from typing import Type import av import numpy as np import torch -import torchaudio +try: + import torchaudio + TORCH_AUDIO_AVAILABLE = True +except ImportError: + TORCH_AUDIO_AVAILABLE = False from PIL import Image as PILImage from PIL.PngImagePlugin import PngInfo @@ -302,6 +306,8 @@ class AudioSaveHelper: # Resample if necessary if sample_rate != audio["sample_rate"]: + if not TORCH_AUDIO_AVAILABLE: + raise Exception("torchaudio is not available; cannot resample audio.") waveform = torchaudio.functional.resample(waveform, audio["sample_rate"], sample_rate) # Create output with specified format From bf2a1b5b1ef72b240454f3ac44f5209af45efe00 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 8 Aug 2025 06:37:50 +0300 Subject: [PATCH 110/759] async API nodes (#9129) * converted API nodes to async * converted BFL API nodes to async * fixed client bug; converted gemini, ideogram, minimax * fixed client bug; converted openai nodes * fixed client bug; converted moonvalley, pika nodes * fixed client bug; converted kling, luma nodes * converted pixverse, rodin nodes * converted tripo, veo2 * converted recraft nodes * add lost log_request_response call --- comfy_api_nodes/apinode_utils.py | 152 ++--- comfy_api_nodes/apis/client.py | 901 +++++++++++----------------- comfy_api_nodes/nodes_bfl.py | 134 +++-- comfy_api_nodes/nodes_gemini.py | 4 +- comfy_api_nodes/nodes_ideogram.py | 22 +- comfy_api_nodes/nodes_kling.py | 130 ++-- comfy_api_nodes/nodes_luma.py | 70 ++- comfy_api_nodes/nodes_minimax.py | 14 +- comfy_api_nodes/nodes_moonvalley.py | 38 +- comfy_api_nodes/nodes_openai.py | 28 +- comfy_api_nodes/nodes_pika.py | 63 +- comfy_api_nodes/nodes_pixverse.py | 46 +- comfy_api_nodes/nodes_recraft.py | 44 +- comfy_api_nodes/nodes_rodin.py | 147 ++--- comfy_api_nodes/nodes_runway.py | 54 +- comfy_api_nodes/nodes_stability.py | 23 +- comfy_api_nodes/nodes_tripo.py | 69 ++- comfy_api_nodes/nodes_veo2.py | 15 +- 18 files changed, 878 insertions(+), 1076 deletions(-) diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py index 788e2803f..f953f86df 100644 --- a/comfy_api_nodes/apinode_utils.py +++ b/comfy_api_nodes/apinode_utils.py @@ -1,4 +1,5 @@ from __future__ import annotations +import aiohttp import io import logging import mimetypes @@ -21,7 +22,6 @@ from server import PromptServer import numpy as np from PIL import Image -import requests import torch import math import base64 @@ -30,7 +30,7 @@ from io import BytesIO import av -def download_url_to_video_output(video_url: str, timeout: int = None) -> VideoFromFile: +async def download_url_to_video_output(video_url: str, timeout: int = None) -> VideoFromFile: """Downloads a video from a URL and returns a `VIDEO` output. Args: @@ -39,7 +39,7 @@ def download_url_to_video_output(video_url: str, timeout: int = None) -> VideoFr Returns: A Comfy node `VIDEO` output. """ - video_io = download_url_to_bytesio(video_url, timeout) + video_io = await download_url_to_bytesio(video_url, timeout) if video_io is None: error_msg = f"Failed to download video from {video_url}" logging.error(error_msg) @@ -62,7 +62,7 @@ def downscale_image_tensor(image, total_pixels=1536 * 1024) -> torch.Tensor: return s -def validate_and_cast_response( +async def validate_and_cast_response( response, timeout: int = None, node_id: Union[str, None] = None ) -> torch.Tensor: """Validates and casts a response to a torch.Tensor. @@ -86,35 +86,24 @@ def validate_and_cast_response( image_tensors: list[torch.Tensor] = [] # Process each image in the data array - for image_data in data: - image_url = image_data.url - b64_data = image_data.b64_json + async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=timeout)) as session: + for img_data in data: + img_bytes: bytes + if img_data.b64_json: + img_bytes = base64.b64decode(img_data.b64_json) + elif img_data.url: + if node_id: + PromptServer.instance.send_progress_text(f"Result URL: {img_data.url}", node_id) + async with session.get(img_data.url) as resp: + if resp.status != 200: + raise ValueError("Failed to download generated image") + img_bytes = await resp.read() + else: + raise ValueError("Invalid image payload – neither URL nor base64 data present.") - if not image_url and not b64_data: - raise ValueError("No image was generated in the response") - - if b64_data: - img_data = base64.b64decode(b64_data) - img = Image.open(io.BytesIO(img_data)) - - elif image_url: - if node_id: - PromptServer.instance.send_progress_text( - f"Result URL: {image_url}", node_id - ) - img_response = requests.get(image_url, timeout=timeout) - if img_response.status_code != 200: - raise ValueError("Failed to download the image") - img = Image.open(io.BytesIO(img_response.content)) - - img = img.convert("RGBA") - - # Convert to numpy array, normalize to float32 between 0 and 1 - img_array = np.array(img).astype(np.float32) / 255.0 - img_tensor = torch.from_numpy(img_array) - - # Add to list of tensors - image_tensors.append(img_tensor) + pil_img = Image.open(BytesIO(img_bytes)).convert("RGBA") + arr = np.asarray(pil_img).astype(np.float32) / 255.0 + image_tensors.append(torch.from_numpy(arr)) return torch.stack(image_tensors, dim=0) @@ -175,7 +164,7 @@ def mimetype_to_extension(mime_type: str) -> str: return mime_type.split("/")[-1].lower() -def download_url_to_bytesio(url: str, timeout: int = None) -> BytesIO: +async def download_url_to_bytesio(url: str, timeout: int = None) -> BytesIO: """Downloads content from a URL using requests and returns it as BytesIO. Args: @@ -185,9 +174,11 @@ def download_url_to_bytesio(url: str, timeout: int = None) -> BytesIO: Returns: BytesIO object containing the downloaded content. """ - response = requests.get(url, stream=True, timeout=timeout) - response.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX) - return BytesIO(response.content) + timeout_cfg = aiohttp.ClientTimeout(total=timeout) if timeout else None + async with aiohttp.ClientSession(timeout=timeout_cfg) as session: + async with session.get(url) as resp: + resp.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX) + return BytesIO(await resp.read()) def bytesio_to_image_tensor(image_bytesio: BytesIO, mode: str = "RGBA") -> torch.Tensor: @@ -210,15 +201,15 @@ def bytesio_to_image_tensor(image_bytesio: BytesIO, mode: str = "RGBA") -> torch return torch.from_numpy(image_array).unsqueeze(0) -def download_url_to_image_tensor(url: str, timeout: int = None) -> torch.Tensor: +async def download_url_to_image_tensor(url: str, timeout: int = None) -> torch.Tensor: """Downloads an image from a URL and returns a [B, H, W, C] tensor.""" - image_bytesio = download_url_to_bytesio(url, timeout) + image_bytesio = await download_url_to_bytesio(url, timeout) return bytesio_to_image_tensor(image_bytesio) -def process_image_response(response: requests.Response) -> torch.Tensor: +def process_image_response(response_content: bytes | str) -> torch.Tensor: """Uses content from a Response object and converts it to a torch.Tensor""" - return bytesio_to_image_tensor(BytesIO(response.content)) + return bytesio_to_image_tensor(BytesIO(response_content)) def _tensor_to_pil(image: torch.Tensor, total_pixels: int = 2048 * 2048) -> Image.Image: @@ -336,10 +327,10 @@ def text_filepath_to_data_uri(filepath: str) -> str: return f"data:{mime_type};base64,{base64_string}" -def upload_file_to_comfyapi( +async def upload_file_to_comfyapi( file_bytes_io: BytesIO, filename: str, - upload_mime_type: str, + upload_mime_type: Optional[str], auth_kwargs: Optional[dict[str, str]] = None, ) -> str: """ @@ -354,7 +345,10 @@ def upload_file_to_comfyapi( Returns: The download URL for the uploaded file. """ - request_object = UploadRequest(file_name=filename, content_type=upload_mime_type) + if upload_mime_type is None: + request_object = UploadRequest(file_name=filename) + else: + request_object = UploadRequest(file_name=filename, content_type=upload_mime_type) operation = SynchronousOperation( endpoint=ApiEndpoint( path="/customers/storage", @@ -366,12 +360,8 @@ def upload_file_to_comfyapi( auth_kwargs=auth_kwargs, ) - response: UploadResponse = operation.execute() - upload_response = ApiClient.upload_file( - response.upload_url, file_bytes_io, content_type=upload_mime_type - ) - upload_response.raise_for_status() - + response: UploadResponse = await operation.execute() + await ApiClient.upload_file(response.upload_url, file_bytes_io, content_type=upload_mime_type) return response.download_url @@ -399,7 +389,7 @@ def video_to_base64_string( return base64.b64encode(video_bytes_io.getvalue()).decode("utf-8") -def upload_video_to_comfyapi( +async def upload_video_to_comfyapi( video: VideoInput, auth_kwargs: Optional[dict[str, str]] = None, container: VideoContainer = VideoContainer.MP4, @@ -439,9 +429,7 @@ def upload_video_to_comfyapi( video.save_to(video_bytes_io, format=container, codec=codec) video_bytes_io.seek(0) - return upload_file_to_comfyapi( - video_bytes_io, filename, upload_mime_type, auth_kwargs - ) + return await upload_file_to_comfyapi(video_bytes_io, filename, upload_mime_type, auth_kwargs) def audio_tensor_to_contiguous_ndarray(waveform: torch.Tensor) -> np.ndarray: @@ -501,7 +489,7 @@ def audio_ndarray_to_bytesio( return audio_bytes_io -def upload_audio_to_comfyapi( +async def upload_audio_to_comfyapi( audio: AudioInput, auth_kwargs: Optional[dict[str, str]] = None, container_format: str = "mp4", @@ -527,7 +515,7 @@ def upload_audio_to_comfyapi( audio_data_np, sample_rate, container_format, codec_name ) - return upload_file_to_comfyapi(audio_bytes_io, filename, mime_type, auth_kwargs) + return await upload_file_to_comfyapi(audio_bytes_io, filename, mime_type, auth_kwargs) def audio_to_base64_string( @@ -544,7 +532,7 @@ def audio_to_base64_string( return base64.b64encode(audio_bytes).decode("utf-8") -def upload_images_to_comfyapi( +async def upload_images_to_comfyapi( image: torch.Tensor, max_images=8, auth_kwargs: Optional[dict[str, str]] = None, @@ -561,55 +549,15 @@ def upload_images_to_comfyapi( mime_type: Optional MIME type for the image. """ # if batch, try to upload each file if max_images is greater than 0 - idx_image = 0 download_urls: list[str] = [] is_batch = len(image.shape) > 3 - batch_length = 1 - if is_batch: - batch_length = image.shape[0] - while True: - curr_image = image - if len(image.shape) > 3: - curr_image = image[idx_image] - # get BytesIO version of image - img_binary = tensor_to_bytesio(curr_image, mime_type=mime_type) - # first, request upload/download urls from comfy API - if not mime_type: - request_object = UploadRequest(file_name=img_binary.name) - else: - request_object = UploadRequest( - file_name=img_binary.name, content_type=mime_type - ) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/customers/storage", - method=HttpMethod.POST, - request_model=UploadRequest, - response_model=UploadResponse, - ), - request=request_object, - auth_kwargs=auth_kwargs, - ) - response = operation.execute() + batch_len = image.shape[0] if is_batch else 1 - upload_response = ApiClient.upload_file( - response.upload_url, img_binary, content_type=mime_type - ) - # verify success - try: - upload_response.raise_for_status() - except requests.exceptions.HTTPError as e: - raise ValueError(f"Could not upload one or more images: {e}") from e - # add download_url to list - download_urls.append(response.download_url) - - idx_image += 1 - # stop uploading additional files if done - if is_batch and max_images > 0: - if idx_image >= max_images: - break - if idx_image >= batch_length: - break + for idx in range(min(batch_len, max_images)): + tensor = image[idx] if is_batch else image + img_io = tensor_to_bytesio(tensor, mime_type=mime_type) + url = await upload_file_to_comfyapi(img_io, img_io.name, mime_type, auth_kwargs) + download_urls.append(url) return download_urls diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py index 2a4bac88b..4ad0b783b 100644 --- a/comfy_api_nodes/apis/client.py +++ b/comfy_api_nodes/apis/client.py @@ -43,7 +43,7 @@ operation = ApiOperation( endpoint=user_info_endpoint, request=request ) -user_profile = operation.execute(client=api_client) # Returns immediately with the result +user_profile = await operation.execute(client=api_client) # Returns immediately with the result # Example 2: Asynchronous API Operation with Polling @@ -87,18 +87,19 @@ operation = PollingOperation( ) # This will make the initial request and then poll until completion -result = operation.execute(client=api_client) # Returns the final ImageGenerationResult when done +result = await operation.execute(client=api_client) # Returns the final ImageGenerationResult when done """ from __future__ import annotations +import aiohttp +import asyncio import logging -import time import io import socket +from aiohttp.client_exceptions import ClientError, ClientResponseError from typing import Dict, Type, Optional, Any, TypeVar, Generic, Callable, Tuple from enum import Enum import json -import requests from urllib.parse import urljoin, urlparse from pydantic import BaseModel, Field import uuid # For generating unique operation IDs @@ -174,6 +175,7 @@ class ApiClient: retry_delay: float = 1.0, retry_backoff_factor: float = 2.0, retry_status_codes: Optional[Tuple[int, ...]] = None, + session: Optional[aiohttp.ClientSession] = None, ): self.base_url = base_url self.auth_token = auth_token @@ -186,13 +188,16 @@ class ApiClient: # Default retry status codes: 408 (Request Timeout), 429 (Too Many Requests), # 500, 502, 503, 504 (Server Errors) self.retry_status_codes = retry_status_codes or (408, 429, 500, 502, 503, 504) + self._session: Optional[aiohttp.ClientSession] = session + self._owns_session = session is None # Track if we have to close it - def _generate_operation_id(self, path: str) -> str: + @staticmethod + def _generate_operation_id(path: str) -> str: """Generates a unique operation ID for logging.""" return f"{path.strip('/').replace('/', '_')}_{uuid.uuid4().hex[:8]}" + @staticmethod def _create_json_payload_args( - self, data: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, ) -> Dict[str, Any]: @@ -203,31 +208,53 @@ class ApiClient: def _create_form_data_args( self, - data: Dict[str, Any], - files: Dict[str, Any], + data: Dict[str, Any] | None, + files: Dict[str, Any] | None, headers: Optional[Dict[str, str]] = None, - multipart_parser = None, + multipart_parser: Callable | None = None, ) -> Dict[str, Any]: if headers and "Content-Type" in headers: del headers["Content-Type"] - if multipart_parser: + if multipart_parser and data: data = multipart_parser(data) - return { - "data": data, - "files": files, - "headers": headers, - } + form = aiohttp.FormData(default_to_multipart=True) + if data: # regular text fields + for k, v in data.items(): + if v is None: + continue # aiohttp fails to serialize "None" values + # aiohttp expects strings or bytes; convert enums etc. + form.add_field(k, str(v) if not isinstance(v, (bytes, bytearray)) else v) + if files: + file_iter = files if isinstance(files, list) else files.items() + for field_name, file_obj in file_iter: + if file_obj is None: + continue # aiohttp fails to serialize "None" values + # file_obj can be (filename, bytes/io.BytesIO, content_type) tuple + if isinstance(file_obj, tuple): + filename, file_value, content_type = self._unpack_tuple(file_obj) + else: + file_value = file_obj + filename = getattr(file_obj, "name", field_name) + content_type = "application/octet-stream" + + form.add_field( + name=field_name, + value=file_value, + filename=filename, + content_type=content_type, + ) + return {"data": form, "headers": headers or {}} + + @staticmethod def _create_urlencoded_form_data_args( - self, data: Dict[str, Any], headers: Optional[Dict[str, str]] = None, ) -> Dict[str, Any]: headers = headers or {} headers["Content-Type"] = "application/x-www-form-urlencoded" - return { "data": data, "headers": headers, @@ -244,7 +271,7 @@ class ApiClient: return headers - def _check_connectivity(self, target_url: str) -> Dict[str, bool]: + async def _check_connectivity(self, target_url: str) -> Dict[str, bool]: """ Check connectivity to determine if network issues are local or server-related. @@ -258,52 +285,39 @@ class ApiClient: "internet_accessible": False, "api_accessible": False, "is_local_issue": False, - "is_api_issue": False + "is_api_issue": False, } + timeout = aiohttp.ClientTimeout(total=5.0) + async with aiohttp.ClientSession(timeout=timeout) as session: + try: + async with session.get("https://www.google.com", ssl=self.verify_ssl) as resp: + results["internet_accessible"] = resp.status < 500 + except (ClientError, asyncio.TimeoutError, socket.gaierror): + results["is_local_issue"] = True + return results # cannot reach the internet – early exit - # First check basic internet connectivity using a reliable external site - try: - # Use a reliable external domain for checking basic connectivity - check_response = requests.get("https://www.google.com", - timeout=5.0, - verify=self.verify_ssl) - if check_response.status_code < 500: - results["internet_accessible"] = True - except (requests.RequestException, socket.error): - results["internet_accessible"] = False - results["is_local_issue"] = True - return results - - # Now check API server connectivity - try: - # Extract domain from the target URL to do a simpler health check - parsed_url = urlparse(target_url) - api_base = f"{parsed_url.scheme}://{parsed_url.netloc}" - - # Try to reach the API domain - api_response = requests.get(f"{api_base}/health", timeout=5.0, verify=self.verify_ssl) - if api_response.status_code < 500: - results["api_accessible"] = True - else: - results["api_accessible"] = False - results["is_api_issue"] = True - except requests.RequestException: - results["api_accessible"] = False - # If we can reach the internet but not the API, it's an API issue - results["is_api_issue"] = True + # Now check API health endpoint + parsed = urlparse(target_url) + health_url = f"{parsed.scheme}://{parsed.netloc}/health" + try: + async with session.get(health_url, ssl=self.verify_ssl) as resp: + results["api_accessible"] = resp.status < 500 + except ClientError: + pass # leave as False + results["is_api_issue"] = results["internet_accessible"] and not results["api_accessible"] return results - def request( + async def request( self, method: str, path: str, params: Optional[Dict[str, Any]] = None, data: Optional[Dict[str, Any]] = None, - files: Optional[Dict[str, Any]] = None, + files: Optional[Dict[str, Any] | list[tuple[str, Any]]] = None, headers: Optional[Dict[str, str]] = None, content_type: str = "application/json", - multipart_parser: Callable = None, + multipart_parser: Callable | None = None, retry_count: int = 0, # Used internally for tracking retries ) -> Dict[str, Any]: """ @@ -327,18 +341,19 @@ class ApiClient: ApiServerError: If the API server is unreachable but internet is working Exception: For other request failures """ - # Use urljoin but ensure path is relative to avoid absolute path behavior - relative_path = path.lstrip('/') + + # Build full URL and merge headers + relative_path = path.lstrip("/") url = urljoin(self.base_url, relative_path) - self.check_auth(self.auth_token, self.comfy_api_key) - # Combine default headers with any provided headers + self._check_auth(self.auth_token, self.comfy_api_key) + request_headers = self.get_headers() if headers: request_headers.update(headers) - - # Let requests handle the content type when files are present. if files: - del request_headers["Content-Type"] + request_headers.pop("Content-Type", None) + if params: + params = {k: v for k, v in params.items() if v is not None} # aiohttp fails to serialize None values logging.debug(f"[DEBUG] Request Headers: {request_headers}") logging.debug(f"[DEBUG] Files: {files}") @@ -346,11 +361,9 @@ class ApiClient: logging.debug(f"[DEBUG] Data: {data}") if content_type == "application/x-www-form-urlencoded": - payload_args = self._create_urlencoded_form_data_args(data, request_headers) + payload_args = self._create_urlencoded_form_data_args(data or {}, request_headers) elif content_type == "multipart/form-data": - payload_args = self._create_form_data_args( - data, files, request_headers, multipart_parser - ) + payload_args = self._create_form_data_args(data, files, request_headers, multipart_parser) else: payload_args = self._create_json_payload_args(data, request_headers) @@ -361,220 +374,67 @@ class ApiClient: request_url=url, request_headers=request_headers, request_params=params, - request_data=data if content_type == "application/json" else "[form-data or other]" + request_data=data if content_type == "application/json" else "[form-data or other]", ) + session = await self._get_session() try: - response = requests.request( - method=method, - url=url, + async with session.request( + method, + url, params=params, - timeout=self.timeout, - verify=self.verify_ssl, + ssl=self.verify_ssl, **payload_args, - ) + ) as resp: + if resp.status >= 400: + try: + error_data = await resp.json() + except (aiohttp.ContentTypeError, json.JSONDecodeError): + error_data = await resp.text() - # Check if we should retry based on status code - if (response.status_code in self.retry_status_codes and - retry_count < self.max_retries): + return await self._handle_http_error( + ClientResponseError(resp.request_info, resp.history, status=resp.status, message=error_data), + operation_id, + method, + url, + params, + data, + files, + headers, + content_type, + multipart_parser, + retry_count=retry_count, + response_content=error_data, + ) - # Calculate delay with exponential backoff - delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) - - logging.warning( - f"Request failed with status {response.status_code}. " - f"Retrying in {delay:.2f}s ({retry_count + 1}/{self.max_retries})" - ) - - time.sleep(delay) - return self.request( - method=method, - path=path, - params=params, - data=data, - files=files, - headers=headers, - content_type=content_type, - multipart_parser=multipart_parser, - retry_count=retry_count + 1, - ) - - # Raise exception for error status codes - response.raise_for_status() - - # Log successful response - response_content_to_log = response.content - try: - # Attempt to parse JSON for prettier logging, fallback to raw content - response_content_to_log = response.json() - except json.JSONDecodeError: - pass # Keep as bytes/str if not JSON - - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, # Pass request details again for context in log - request_url=url, - response_status_code=response.status_code, - response_headers=dict(response.headers), - response_content=response_content_to_log - ) - - except requests.ConnectionError as e: - error_message = f"ConnectionError: {str(e)}" - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, - request_url=url, - error_message=error_message - ) - # Only perform connectivity check if we've exhausted all retries - if retry_count >= self.max_retries: - # Check connectivity to determine if it's a local or API issue - connectivity = self._check_connectivity(self.base_url) - - if connectivity["is_local_issue"]: - raise LocalNetworkError( - "Unable to connect to the API server due to local network issues. " - "Please check your internet connection and try again." - ) from e - elif connectivity["is_api_issue"]: - raise ApiServerError( - f"The API server at {self.base_url} is currently unreachable. " - f"The service may be experiencing issues. Please try again later." - ) from e - - # If we haven't exhausted retries yet, retry the request - if retry_count < self.max_retries: - delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) - logging.warning( - f"Connection error: {str(e)}. " - f"Retrying in {delay:.2f}s ({retry_count + 1}/{self.max_retries})" - ) - time.sleep(delay) - return self.request( - method=method, - path=path, - params=params, - data=data, - files=files, - headers=headers, - content_type=content_type, - multipart_parser=multipart_parser, - retry_count=retry_count + 1, - ) - - # If we've exhausted retries and didn't identify the specific issue, - # raise a generic exception - final_error_message = ( - f"Unable to connect to the API server after {self.max_retries} attempts. " - f"Please check your internet connection or try again later." - ) - request_logger.log_request_response( # Log final failure - operation_id=operation_id, - request_method=method, request_url=url, - error_message=final_error_message - ) - raise Exception(final_error_message) from e - - except requests.Timeout as e: - error_message = f"Timeout: {str(e)}" - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, request_url=url, - error_message=error_message - ) - # Retry timeouts if we haven't exhausted retries - if retry_count < self.max_retries: - delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) - logging.warning( - f"Request timed out. " - f"Retrying in {delay:.2f}s ({retry_count + 1}/{self.max_retries})" - ) - time.sleep(delay) - return self.request( - method=method, - path=path, - params=params, - data=data, - files=files, - headers=headers, - content_type=content_type, - multipart_parser=multipart_parser, - retry_count=retry_count + 1, - ) - final_error_message = ( - f"Request timed out after {self.timeout} seconds and {self.max_retries} retry attempts. " - f"The server might be experiencing high load or the operation is taking longer than expected." - ) - request_logger.log_request_response( # Log final failure - operation_id=operation_id, - request_method=method, request_url=url, - error_message=final_error_message - ) - raise Exception(final_error_message) from e - - except requests.HTTPError as e: - status_code = e.response.status_code if hasattr(e, "response") else None - original_error_message = f"HTTP Error: {str(e)}" - error_content_for_log = None - if hasattr(e, "response") and e.response is not None: - error_content_for_log = e.response.content + # Success – parse JSON (safely) and log try: - error_content_for_log = e.response.json() - except json.JSONDecodeError: - pass + payload = await resp.json() + response_content_to_log = payload + except (aiohttp.ContentTypeError, json.JSONDecodeError): + payload = {} + response_content_to_log = await resp.text() - - # Try to extract detailed error message from JSON response for user display - # but log the full error content. - user_display_error_message = original_error_message - - try: - if hasattr(e, "response") and e.response is not None and e.response.content: - error_json = e.response.json() - if "error" in error_json and "message" in error_json["error"]: - user_display_error_message = f"API Error: {error_json['error']['message']}" - if "type" in error_json["error"]: - user_display_error_message += f" (Type: {error_json['error']['type']})" - elif isinstance(error_json, dict): # Handle cases where error is just a JSON dict - user_display_error_message = f"API Error: {json.dumps(error_json)}" - else: # Non-dict JSON error - user_display_error_message = f"API Error: {str(error_json)}" - except json.JSONDecodeError: - # If not JSON, use the raw content if it's not too long, or a summary - if hasattr(e, "response") and e.response is not None and e.response.content: - raw_content = e.response.content.decode(errors='ignore') - if len(raw_content) < 200: # Arbitrary limit for display - user_display_error_message = f"API Error (raw): {raw_content}" - else: - user_display_error_message = f"API Error (raw, status {status_code})" - - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, request_url=url, - response_status_code=status_code, - response_headers=dict(e.response.headers) if hasattr(e, "response") and e.response is not None else None, - response_content=error_content_for_log, - error_message=original_error_message # Log the original exception string as error - ) - - logging.debug(f"[DEBUG] API Error: {user_display_error_message} (Status: {status_code})") - if hasattr(e, "response") and e.response is not None and e.response.content: - logging.debug(f"[DEBUG] Response content: {e.response.content}") - - # Retry if the status code is in our retry list and we haven't exhausted retries - if (status_code in self.retry_status_codes and - retry_count < self.max_retries): - - delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) - logging.warning( - f"HTTP error {status_code}. " - f"Retrying in {delay:.2f}s ({retry_count + 1}/{self.max_retries})" + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=response_content_to_log, ) - time.sleep(delay) - return self.request( - method=method, - path=path, + return payload + + except (ClientError, asyncio.TimeoutError, socket.gaierror) as e: + # Treat as *connection* problem – optionally retry, else escalate + if retry_count < self.max_retries: + delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) + logging.warning("Connection error. Retrying in %.2fs (%s/%s): %s", delay, retry_count + 1, + self.max_retries, str(e)) + await asyncio.sleep(delay) + return await self.request( + method, + path, params=params, data=data, files=files, @@ -583,40 +443,34 @@ class ApiClient: multipart_parser=multipart_parser, retry_count=retry_count + 1, ) + # One final connectivity check for diagnostics + connectivity = await self._check_connectivity(self.base_url) + if connectivity["is_local_issue"]: + raise LocalNetworkError( + "Unable to connect to the API server due to local network issues. " + "Please check your internet connection and try again." + ) from e + raise ApiServerError( + f"The API server at {self.base_url} is currently unreachable. " + f"The service may be experiencing issues. Please try again later." + ) from e - # Specific error messages for common status codes for user display - if status_code == 401: - user_display_error_message = "Unauthorized: Please login first to use this node." - elif status_code == 402: - user_display_error_message = "Payment Required: Please add credits to your account to use this node." - elif status_code == 409: - user_display_error_message = "There is a problem with your account. Please contact support@comfy.org." - elif status_code == 429: - user_display_error_message = "Rate Limit Exceeded: Please try again later." - # else, user_display_error_message remains as parsed from response or original HTTPError string - - raise Exception(user_display_error_message) # Raise with the user-friendly message - - # Parse and return JSON response - if response.content: - return response.json() - return {} - - def check_auth(self, auth_token, comfy_api_key): + @staticmethod + def _check_auth(auth_token, comfy_api_key): """Verify that an auth token is present or comfy_api_key is present""" if auth_token is None and comfy_api_key is None: raise Exception("Unauthorized: Please login first to use this node.") return auth_token or comfy_api_key @staticmethod - def upload_file( + async def upload_file( upload_url: str, file: io.BytesIO | str, content_type: str | None = None, max_retries: int = 3, retry_delay: float = 1.0, retry_backoff_factor: float = 2.0, - ): + ) -> aiohttp.ClientResponse: """Upload a file to the API with retry logic. Args: @@ -627,112 +481,167 @@ class ApiClient: retry_delay: Initial delay between retries in seconds retry_backoff_factor: Multiplier for the delay after each retry """ - headers = {} + headers: Dict[str, str] = {} + skip_auto_headers: set[str] = set() if content_type: headers["Content-Type"] = content_type + else: + # tell aiohttp not to add Content-Type that will break the request signature and result in a 403 status. + skip_auto_headers.add("Content-Type") - # Prepare the file data + # Extract file bytes if isinstance(file, io.BytesIO): - file.seek(0) # Ensure we're at the start of the file + file.seek(0) data = file.read() elif isinstance(file, str): with open(file, "rb") as f: data = f.read() else: - raise ValueError("File must be either a BytesIO object or a file path string") + raise ValueError("File must be BytesIO or str path") - # Try the upload with retries - last_exception = None - operation_id = f"upload_{upload_url.split('/')[-1]}_{uuid.uuid4().hex[:8]}" # Simplified ID for uploads - - # Log initial attempt (without full file data for brevity) + operation_id = f"upload_{upload_url.split('/')[-1]}_{uuid.uuid4().hex[:8]}" request_logger.log_request_response( operation_id=operation_id, request_method="PUT", request_url=upload_url, request_headers=headers, - request_data=f"[File data of type {content_type or 'unknown'}, size {len(data)} bytes]" + request_data=f"[File data {len(data)} bytes]", ) - for retry_attempt in range(max_retries + 1): + delay = retry_delay + for attempt in range(max_retries + 1): try: - response = requests.put(upload_url, data=data, headers=headers) - response.raise_for_status() + timeout = aiohttp.ClientTimeout(total=None) # honour server side timeouts + async with aiohttp.ClientSession(timeout=timeout) as session: + async with session.put( + upload_url, data=data, headers=headers, skip_auto_headers=skip_auto_headers, + ) as resp: + resp.raise_for_status() + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content="File uploaded successfully.", + ) + return resp + except (ClientError, asyncio.TimeoutError) as e: request_logger.log_request_response( operation_id=operation_id, - request_method="PUT", request_url=upload_url, # For context - response_status_code=response.status_code, - response_headers=dict(response.headers), - response_content="File uploaded successfully." # Or response.text if available + request_method="PUT", + request_url=upload_url, + response_status_code=e.status if hasattr(e, "status") else None, + response_headers=dict(e.headers) if getattr(e, "headers") else None, + response_content=None, + error_message=f"{type(e).__name__}: {str(e)}", ) - return response - - except (requests.ConnectionError, requests.Timeout, requests.HTTPError) as e: - last_exception = e - error_message_for_log = f"{type(e).__name__}: {str(e)}" - response_content_for_log = None - status_code_for_log = None - headers_for_log = None - - if hasattr(e, 'response') and e.response is not None: - status_code_for_log = e.response.status_code - headers_for_log = dict(e.response.headers) - try: - response_content_for_log = e.response.json() - except json.JSONDecodeError: - response_content_for_log = e.response.content - - - request_logger.log_request_response( - operation_id=operation_id, - request_method="PUT", request_url=upload_url, - response_status_code=status_code_for_log, - response_headers=headers_for_log, - response_content=response_content_for_log, - error_message=error_message_for_log - ) - - if retry_attempt < max_retries: - delay = retry_delay * (retry_backoff_factor ** retry_attempt) + if attempt < max_retries: logging.warning( - f"File upload failed: {str(e)}. " - f"Retrying in {delay:.2f}s ({retry_attempt + 1}/{max_retries})" + "Upload failed (%s/%s). Retrying in %.2fs. %s", attempt + 1, max_retries, delay, str(e) ) - time.sleep(delay) + await asyncio.sleep(delay) + delay *= retry_backoff_factor else: - break # Max retries reached + raise NetworkError(f"Failed to upload file after {max_retries + 1} attempts: {e}") from e - # If we've exhausted all retries, determine the final error type and raise - final_error_message = f"Failed to upload file after {max_retries + 1} attempts. Error: {str(last_exception)}" - try: - # Check basic internet connectivity - check_response = requests.get("https://www.google.com", timeout=5.0, verify=True) # Assuming verify=True is desired - if check_response.status_code >= 500: # Google itself has an issue (rare) - final_error_message = (f"Failed to upload file. Internet connectivity check to Google failed " - f"(status {check_response.status_code}). Original error: {str(last_exception)}") - # Not raising LocalNetworkError here as Google itself might be down. - # If Google is reachable, the issue is likely with the upload server or a more specific local problem - # not caught by a simple Google ping (e.g., DNS for the specific upload URL, firewall). - # The original last_exception is probably most relevant. + async def _handle_http_error( + self, + exc: ClientResponseError, + operation_id: str, + *req_meta, + retry_count: int, + response_content: dict | str = "", + ) -> Dict[str, Any]: + status_code = exc.status + if status_code == 401: + user_friendly = "Unauthorized: Please login first to use this node." + elif status_code == 402: + user_friendly = "Payment Required: Please add credits to your account to use this node." + elif status_code == 409: + user_friendly = "There is a problem with your account. Please contact support@comfy.org." + elif status_code == 429: + user_friendly = "Rate Limit Exceeded: Please try again later." + else: + if isinstance(response_content, dict): + if "error" in response_content and "message" in response_content["error"]: + user_friendly = f"API Error: {response_content['error']['message']}" + if "type" in response_content["error"]: + user_friendly += f" (Type: {response_content['error']['type']})" + else: # Handle cases where error is just a JSON dict with unknown format + user_friendly = f"API Error: {json.dumps(response_content)}" + else: + if len(response_content) < 200: # Arbitrary limit for display + user_friendly = f"API Error (raw): {response_content}" + else: + user_friendly = f"API Error (raw, status {response_content})" - except (requests.RequestException, socket.error) as conn_check_exc: - # Could not reach Google, likely a local network issue - final_error_message = (f"Failed to upload file due to network connectivity issues " - f"(cannot reach Google: {str(conn_check_exc)}). " - f"Original upload error: {str(last_exception)}") - request_logger.log_request_response( # Log final failure reason - operation_id=operation_id, - request_method="PUT", request_url=upload_url, - error_message=final_error_message - ) - raise LocalNetworkError(final_error_message) from last_exception - - request_logger.log_request_response( # Log final failure reason if not LocalNetworkError + request_logger.log_request_response( operation_id=operation_id, - request_method="PUT", request_url=upload_url, - error_message=final_error_message + request_method=req_meta[0], + request_url=req_meta[1], + response_status_code=exc.status, + response_headers=dict(req_meta[5]) if req_meta[5] else None, + response_content=response_content, + error_message=f"HTTP Error {exc.status}", ) - raise Exception(final_error_message) from last_exception + + logging.debug(f"[DEBUG] API Error: {user_friendly} (Status: {status_code})") + if response_content: + logging.debug(f"[DEBUG] Response content: {response_content}") + + # Retry if eligible + if status_code in self.retry_status_codes and retry_count < self.max_retries: + delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) + logging.warning( + "HTTP error %s. Retrying in %.2fs (%s/%s)", + status_code, + delay, + retry_count + 1, + self.max_retries, + ) + await asyncio.sleep(delay) + return await self.request( + req_meta[0], # method + req_meta[1].replace(self.base_url, ""), # path + params=req_meta[2], + data=req_meta[3], + files=req_meta[4], + headers=req_meta[5], + content_type=req_meta[6], + multipart_parser=req_meta[7], + retry_count=retry_count + 1, + ) + + raise Exception(user_friendly) from exc + + @staticmethod + def _unpack_tuple(t): + """Helper to normalise (filename, file, content_type) tuples.""" + if len(t) == 3: + return t + elif len(t) == 2: + return t[0], t[1], "application/octet-stream" + else: + raise ValueError("files tuple must be (filename, file[, content_type])") + + async def _get_session(self) -> aiohttp.ClientSession: + if self._session is None or self._session.closed: + timeout = aiohttp.ClientTimeout(total=self.timeout) + self._session = aiohttp.ClientSession(timeout=timeout) + self._owns_session = True + return self._session + + async def close(self) -> None: + if self._owns_session and self._session and not self._session.closed: + await self._session.close() + + async def __aenter__(self) -> "ApiClient": + """Allow usage as async‑context‑manager – ensures clean teardown""" + return self + + async def __aexit__(self, exc_type, exc, tb): + await self.close() class ApiEndpoint(Generic[T, R]): @@ -763,31 +672,28 @@ class ApiEndpoint(Generic[T, R]): class SynchronousOperation(Generic[T, R]): - """ - Represents a single synchronous API operation. - """ + """Represents a single synchronous API operation.""" def __init__( self, endpoint: ApiEndpoint[T, R], request: T, - files: Optional[Dict[str, Any]] = None, + files: Optional[Dict[str, Any] | list[tuple[str, Any]]] = None, api_base: str | None = None, auth_token: Optional[str] = None, comfy_api_key: Optional[str] = None, - auth_kwargs: Optional[Dict[str,str]] = None, + auth_kwargs: Optional[Dict[str, str]] = None, timeout: float = 604800.0, verify_ssl: bool = True, content_type: str = "application/json", - multipart_parser: Callable = None, + multipart_parser: Callable | None = None, max_retries: int = 3, retry_delay: float = 1.0, retry_backoff_factor: float = 2.0, - ): + ) -> None: self.endpoint = endpoint self.request = request - self.response = None - self.error = None + self.files = files self.api_base: str = api_base or args.comfy_api_base self.auth_token = auth_token self.comfy_api_key = comfy_api_key @@ -796,91 +702,64 @@ class SynchronousOperation(Generic[T, R]): self.comfy_api_key = auth_kwargs.get("comfy_api_key", self.comfy_api_key) self.timeout = timeout self.verify_ssl = verify_ssl - self.files = files self.content_type = content_type self.multipart_parser = multipart_parser self.max_retries = max_retries self.retry_delay = retry_delay self.retry_backoff_factor = retry_backoff_factor - def execute(self, client: Optional[ApiClient] = None) -> R: - """Execute the API operation using the provided client or create one with retry support""" - try: - # Create client if not provided - if client is None: - client = ApiClient( - base_url=self.api_base, - auth_token=self.auth_token, - comfy_api_key=self.comfy_api_key, - timeout=self.timeout, - verify_ssl=self.verify_ssl, - max_retries=self.max_retries, - retry_delay=self.retry_delay, - retry_backoff_factor=self.retry_backoff_factor, - ) - - # Convert request model to dict, but use None for EmptyRequest - request_dict = ( - None - if isinstance(self.request, EmptyRequest) - else self.request.model_dump(exclude_none=True) + async def execute(self, client: Optional[ApiClient] = None) -> R: + owns_client = client is None + if owns_client: + client = ApiClient( + base_url=self.api_base, + auth_token=self.auth_token, + comfy_api_key=self.comfy_api_key, + timeout=self.timeout, + verify_ssl=self.verify_ssl, + max_retries=self.max_retries, + retry_delay=self.retry_delay, + retry_backoff_factor=self.retry_backoff_factor, ) - if request_dict: - for key, value in request_dict.items(): - if isinstance(value, Enum): - request_dict[key] = value.value - # Debug log for request + try: + request_dict: Optional[Dict[str, Any]] + if isinstance(self.request, EmptyRequest): + request_dict = None + else: + request_dict = self.request.model_dump(exclude_none=True) + for k, v in list(request_dict.items()): + if isinstance(v, Enum): + request_dict[k] = v.value + logging.debug( f"[DEBUG] API Request: {self.endpoint.method.value} {self.endpoint.path}" ) logging.debug(f"[DEBUG] Request Data: {json.dumps(request_dict, indent=2)}") logging.debug(f"[DEBUG] Query Params: {self.endpoint.query_params}") - # Make the request with built-in retry - resp = client.request( - method=self.endpoint.method.value, - path=self.endpoint.path, - data=request_dict, + response_json = await client.request( + self.endpoint.method.value, + self.endpoint.path, params=self.endpoint.query_params, + data=request_dict, files=self.files, content_type=self.content_type, - multipart_parser=self.multipart_parser + multipart_parser=self.multipart_parser, ) - # Debug log for response logging.debug("=" * 50) logging.debug("[DEBUG] RESPONSE DETAILS:") logging.debug("[DEBUG] Status Code: 200 (Success)") - logging.debug(f"[DEBUG] Response Body: {json.dumps(resp, indent=2)}") + logging.debug(f"[DEBUG] Response Body: {json.dumps(response_json, indent=2)}") logging.debug("=" * 50) - # Parse and return the response - return self._parse_response(resp) - - except LocalNetworkError as e: - # Propagate specific network error types - logging.error(f"[ERROR] Local network error: {str(e)}") - raise - - except ApiServerError as e: - # Propagate API server errors - logging.error(f"[ERROR] API server error: {str(e)}") - raise - - except Exception as e: - logging.error(f"[ERROR] API Exception: {str(e)}") - raise Exception(str(e)) - - def _parse_response(self, resp): - """Parse response data - can be overridden by subclasses""" - # The response is already the complete object, don't extract just the "data" field - # as that would lose the outer structure (created timestamp, etc.) - - # Parse response using the provided model - self.response = self.endpoint.response_model.model_validate(resp) - logging.debug(f"[DEBUG] Parsed Response: {self.response}") - return self.response + parsed_response = self.endpoint.response_model.model_validate(response_json) + logging.debug(f"[DEBUG] Parsed Response: {parsed_response}") + return parsed_response + finally: + if owns_client: + await client.close() class TaskStatus(str, Enum): @@ -892,23 +771,21 @@ class TaskStatus(str, Enum): class PollingOperation(Generic[T, R]): - """ - Represents an asynchronous API operation that requires polling for completion. - """ + """Represents an asynchronous API operation that requires polling for completion.""" def __init__( self, poll_endpoint: ApiEndpoint[EmptyRequest, R], - completed_statuses: list, - failed_statuses: list, + completed_statuses: list[str], + failed_statuses: list[str], status_extractor: Callable[[R], str], - progress_extractor: Callable[[R], float] = None, - result_url_extractor: Callable[[R], str] = None, + progress_extractor: Callable[[R], float] | None = None, + result_url_extractor: Callable[[R], str] | None = None, request: Optional[T] = None, api_base: str | None = None, auth_token: Optional[str] = None, comfy_api_key: Optional[str] = None, - auth_kwargs: Optional[Dict[str,str]] = None, + auth_kwargs: Optional[Dict[str, str]] = None, poll_interval: float = 5.0, max_poll_attempts: int = 120, # Default max polling attempts (10 minutes with 5s interval) max_retries: int = 3, # Max retries per individual API call @@ -916,7 +793,7 @@ class PollingOperation(Generic[T, R]): retry_backoff_factor: float = 2.0, estimated_duration: Optional[float] = None, node_id: Optional[str] = None, - ): + ) -> None: self.poll_endpoint = poll_endpoint self.request = request self.api_base: str = api_base or args.comfy_api_base @@ -931,100 +808,73 @@ class PollingOperation(Generic[T, R]): self.retry_delay = retry_delay self.retry_backoff_factor = retry_backoff_factor self.estimated_duration = estimated_duration - - # Polling configuration - self.status_extractor = status_extractor or ( - lambda x: getattr(x, "status", None) - ) + self.status_extractor = status_extractor or (lambda x: getattr(x, "status", None)) self.progress_extractor = progress_extractor self.result_url_extractor = result_url_extractor self.node_id = node_id self.completed_statuses = completed_statuses self.failed_statuses = failed_statuses + self.final_response: Optional[R] = None - # For storing response data - self.final_response = None - self.error = None - - def execute(self, client: Optional[ApiClient] = None) -> R: - """Execute the polling operation using the provided client. If failed, raise an exception.""" + async def execute(self, client: Optional[ApiClient] = None) -> R: + owns_client = client is None + if owns_client: + client = ApiClient( + base_url=self.api_base, + auth_token=self.auth_token, + comfy_api_key=self.comfy_api_key, + max_retries=self.max_retries, + retry_delay=self.retry_delay, + retry_backoff_factor=self.retry_backoff_factor, + ) try: - if client is None: - client = ApiClient( - base_url=self.api_base, - auth_token=self.auth_token, - comfy_api_key=self.comfy_api_key, - max_retries=self.max_retries, - retry_delay=self.retry_delay, - retry_backoff_factor=self.retry_backoff_factor, - ) - return self._poll_until_complete(client) - except LocalNetworkError as e: - # Provide clear message for local network issues - raise Exception( - f"Polling failed due to local network issues. Please check your internet connection. " - f"Details: {str(e)}" - ) from e - except ApiServerError as e: - # Provide clear message for API server issues - raise Exception( - f"Polling failed due to API server issues. The service may be experiencing problems. " - f"Please try again later. Details: {str(e)}" - ) from e - except Exception as e: - raise Exception(f"Error during polling: {str(e)}") + return await self._poll_until_complete(client) + finally: + if owns_client: + await client.close() def _display_text_on_node(self, text: str): - """Sends text to the client which will be displayed on the node in the UI""" if not self.node_id: return - PromptServer.instance.send_progress_text(text, self.node_id) - def _display_time_progress_on_node(self, time_completed: int): + def _display_time_progress_on_node(self, time_completed: int | float): if not self.node_id: return - if self.estimated_duration is not None: - estimated_time_remaining = max( - 0, int(self.estimated_duration) - int(time_completed) - ) - message = f"Task in progress: {time_completed:.0f}s (~{estimated_time_remaining:.0f}s remaining)" + remaining = max(0, int(self.estimated_duration) - time_completed) + message = f"Task in progress: {time_completed}s (~{remaining}s remaining)" else: - message = f"Task in progress: {time_completed:.0f}s" + message = f"Task in progress: {time_completed}s" self._display_text_on_node(message) def _check_task_status(self, response: R) -> TaskStatus: - """Check task status using the status extractor function""" try: status = self.status_extractor(response) if status in self.completed_statuses: return TaskStatus.COMPLETED - elif status in self.failed_statuses: + if status in self.failed_statuses: return TaskStatus.FAILED return TaskStatus.PENDING except Exception as e: - logging.error(f"Error extracting status: {e}") + logging.error("Error extracting status: %s", e) return TaskStatus.PENDING - def _poll_until_complete(self, client: ApiClient) -> R: + async def _poll_until_complete(self, client: ApiClient) -> R: """Poll until the task is complete""" - poll_count = 0 consecutive_errors = 0 max_consecutive_errors = min(5, self.max_retries * 2) # Limit consecutive errors if self.progress_extractor: progress = utils.ProgressBar(PROGRESS_BAR_MAX) - while poll_count < self.max_poll_attempts: + status = TaskStatus.PENDING + for poll_count in range(1, self.max_poll_attempts + 1): try: - poll_count += 1 logging.debug(f"[DEBUG] Polling attempt #{poll_count}") request_dict = ( - self.request.model_dump(exclude_none=True) - if self.request is not None - else None + None if self.request is None else self.request.model_dump(exclude_none=True) ) if poll_count == 1: @@ -1036,18 +886,14 @@ class PollingOperation(Generic[T, R]): ) # Query task status - resp = client.request( - method=self.poll_endpoint.method.value, - path=self.poll_endpoint.path, + resp = await client.request( + self.poll_endpoint.method.value, + self.poll_endpoint.path, params=self.poll_endpoint.query_params, data=request_dict, ) - - # Successfully got a response, reset consecutive error count - consecutive_errors = 0 - - # Parse response - response_obj = self.poll_endpoint.response_model.model_validate(resp) + consecutive_errors = 0 # reset on success + response_obj: R = self.poll_endpoint.response_model.model_validate(resp) # Check if task is complete status = self._check_task_status(response_obj) @@ -1065,45 +911,30 @@ class PollingOperation(Generic[T, R]): result_url = self.result_url_extractor(response_obj) if result_url: message = f"Result URL: {result_url}" - else: - message = "Task completed successfully!" logging.debug(f"[DEBUG] {message}") self._display_text_on_node(message) self.final_response = response_obj if self.progress_extractor: progress.update(100) return self.final_response - elif status == TaskStatus.FAILED: + if status == TaskStatus.FAILED: message = f"Task failed: {json.dumps(resp)}" logging.error(f"[DEBUG] {message}") raise Exception(message) - else: - logging.debug("[DEBUG] Task still pending, continuing to poll...") - - # Wait before polling again - logging.debug( - f"[DEBUG] Waiting {self.poll_interval} seconds before next poll" - ) + logging.debug("[DEBUG] Task still pending, continuing to poll...") + # Task pending – wait for i in range(int(self.poll_interval)): - time_completed = (poll_count * self.poll_interval) + i - self._display_time_progress_on_node(time_completed) - time.sleep(1) + self._display_time_progress_on_node((poll_count - 1) * self.poll_interval + i) + await asyncio.sleep(1) - except (LocalNetworkError, ApiServerError) as e: - # For network-related errors, increment error count and potentially abort + except (LocalNetworkError, ApiServerError, NetworkError) as e: consecutive_errors += 1 if consecutive_errors >= max_consecutive_errors: raise Exception( - f"Polling aborted after {consecutive_errors} consecutive network errors: {str(e)}" + f"Polling aborted after {consecutive_errors} network errors: {str(e)}" ) from e - - # Log the error but continue polling - logging.warning( - f"Network error during polling (attempt {poll_count}/{self.max_poll_attempts}): {str(e)}. " - f"Will retry in {self.poll_interval} seconds." - ) - time.sleep(self.poll_interval) - + logging.warning("Network error (%s/%s): %s", consecutive_errors, max_consecutive_errors, str(e)) + await asyncio.sleep(self.poll_interval) except Exception as e: # For other errors, increment count and potentially abort consecutive_errors += 1 @@ -1117,10 +948,10 @@ class PollingOperation(Generic[T, R]): f"Error during polling (attempt {poll_count}/{self.max_poll_attempts}): {str(e)}. " f"Will retry in {self.poll_interval} seconds." ) - time.sleep(self.poll_interval) + await asyncio.sleep(self.poll_interval) # If we've exhausted all polling attempts raise Exception( - f"Polling timed out after {poll_count} attempts ({poll_count * self.poll_interval} seconds). " - f"The operation may still be running on the server but is taking longer than expected." + f"Polling timed out after {self.max_poll_attempts} attempts (" f"{self.max_poll_attempts * self.poll_interval} seconds). " + "The operation may still be running on the server but is taking longer than expected." ) diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py index d93fbd778..c09be8d5b 100644 --- a/comfy_api_nodes/nodes_bfl.py +++ b/comfy_api_nodes/nodes_bfl.py @@ -1,3 +1,4 @@ +import asyncio import io from inspect import cleandoc from typing import Union, Optional @@ -28,7 +29,7 @@ from comfy_api_nodes.apinode_utils import ( import numpy as np from PIL import Image -import requests +import aiohttp import torch import base64 import time @@ -44,18 +45,18 @@ def convert_mask_to_image(mask: torch.Tensor): return mask -def handle_bfl_synchronous_operation( +async def handle_bfl_synchronous_operation( operation: SynchronousOperation, timeout_bfl_calls=360, node_id: Union[str, None] = None, ): - response_api: BFLFluxProGenerateResponse = operation.execute() - return _poll_until_generated( + response_api: BFLFluxProGenerateResponse = await operation.execute() + return await _poll_until_generated( response_api.polling_url, timeout=timeout_bfl_calls, node_id=node_id ) -def _poll_until_generated( +async def _poll_until_generated( polling_url: str, timeout=360, node_id: Union[str, None] = None ): # used bfl-comfy-nodes to verify code implementation: @@ -66,55 +67,56 @@ def _poll_until_generated( retry_404_seconds = 2 retry_202_seconds = 2 retry_pending_seconds = 1 - request = requests.Request(method=HttpMethod.GET, url=polling_url) - # NOTE: should True loop be replaced with checking if workflow has been interrupted? - while True: - if node_id: - time_elapsed = time.time() - start_time - PromptServer.instance.send_progress_text( - f"Generating ({time_elapsed:.0f}s)", node_id - ) - response = requests.Session().send(request.prepare()) - if response.status_code == 200: - result = response.json() - if result["status"] == BFLStatus.ready: - img_url = result["result"]["sample"] - if node_id: - PromptServer.instance.send_progress_text( - f"Result URL: {img_url}", node_id - ) - img_response = requests.get(img_url) - return process_image_response(img_response) - elif result["status"] in [ - BFLStatus.request_moderated, - BFLStatus.content_moderated, - ]: - status = result["status"] - raise Exception( - f"BFL API did not return an image due to: {status}." + async with aiohttp.ClientSession() as session: + # NOTE: should True loop be replaced with checking if workflow has been interrupted? + while True: + if node_id: + time_elapsed = time.time() - start_time + PromptServer.instance.send_progress_text( + f"Generating ({time_elapsed:.0f}s)", node_id ) - elif result["status"] == BFLStatus.error: - raise Exception(f"BFL API encountered an error: {result}.") - elif result["status"] == BFLStatus.pending: - time.sleep(retry_pending_seconds) - continue - elif response.status_code == 404: - if retries_404 < max_retries_404: - retries_404 += 1 - time.sleep(retry_404_seconds) - continue - raise Exception( - f"BFL API could not find task after {max_retries_404} tries." - ) - elif response.status_code == 202: - time.sleep(retry_202_seconds) - elif time.time() - start_time > timeout: - raise Exception( - f"BFL API experienced a timeout; could not return request under {timeout} seconds." - ) - else: - raise Exception(f"BFL API encountered an error: {response.json()}") + + async with session.get(polling_url) as response: + if response.status == 200: + result = await response.json() + if result["status"] == BFLStatus.ready: + img_url = result["result"]["sample"] + if node_id: + PromptServer.instance.send_progress_text( + f"Result URL: {img_url}", node_id + ) + async with session.get(img_url) as img_resp: + return process_image_response(await img_resp.content.read()) + elif result["status"] in [ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + ]: + status = result["status"] + raise Exception( + f"BFL API did not return an image due to: {status}." + ) + elif result["status"] == BFLStatus.error: + raise Exception(f"BFL API encountered an error: {result}.") + elif result["status"] == BFLStatus.pending: + await asyncio.sleep(retry_pending_seconds) + continue + elif response.status == 404: + if retries_404 < max_retries_404: + retries_404 += 1 + await asyncio.sleep(retry_404_seconds) + continue + raise Exception( + f"BFL API could not find task after {max_retries_404} tries." + ) + elif response.status == 202: + await asyncio.sleep(retry_202_seconds) + elif time.time() - start_time > timeout: + raise Exception( + f"BFL API experienced a timeout; could not return request under {timeout} seconds." + ) + else: + raise Exception(f"BFL API encountered an error: {response.json()}") def convert_image_to_base64(image: torch.Tensor): scaled_image = downscale_image_tensor(image, total_pixels=2048 * 2048) @@ -222,7 +224,7 @@ class FluxProUltraImageNode(ComfyNodeABC): API_NODE = True CATEGORY = "api node/image/BFL" - def api_call( + async def api_call( self, prompt: str, aspect_ratio: str, @@ -266,7 +268,7 @@ class FluxProUltraImageNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) @@ -354,7 +356,7 @@ class FluxKontextProImageNode(ComfyNodeABC): BFL_PATH = "/proxy/bfl/flux-kontext-pro/generate" - def api_call( + async def api_call( self, prompt: str, aspect_ratio: str, @@ -397,7 +399,7 @@ class FluxKontextProImageNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) @@ -489,7 +491,7 @@ class FluxProImageNode(ComfyNodeABC): API_NODE = True CATEGORY = "api node/image/BFL" - def api_call( + async def api_call( self, prompt: str, prompt_upsampling, @@ -524,7 +526,7 @@ class FluxProImageNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) @@ -632,7 +634,7 @@ class FluxProExpandNode(ComfyNodeABC): API_NODE = True CATEGORY = "api node/image/BFL" - def api_call( + async def api_call( self, image: torch.Tensor, prompt: str, @@ -670,7 +672,7 @@ class FluxProExpandNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) @@ -744,7 +746,7 @@ class FluxProFillNode(ComfyNodeABC): API_NODE = True CATEGORY = "api node/image/BFL" - def api_call( + async def api_call( self, image: torch.Tensor, mask: torch.Tensor, @@ -780,7 +782,7 @@ class FluxProFillNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) @@ -879,7 +881,7 @@ class FluxProCannyNode(ComfyNodeABC): API_NODE = True CATEGORY = "api node/image/BFL" - def api_call( + async def api_call( self, control_image: torch.Tensor, prompt: str, @@ -929,7 +931,7 @@ class FluxProCannyNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) @@ -1008,7 +1010,7 @@ class FluxProDepthNode(ComfyNodeABC): API_NODE = True CATEGORY = "api node/image/BFL" - def api_call( + async def api_call( self, control_image: torch.Tensor, prompt: str, @@ -1045,7 +1047,7 @@ class FluxProDepthNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - output_image = handle_bfl_synchronous_operation(operation, node_id=unique_id) + output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) return (output_image,) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index af33279d5..3751fb2a1 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -303,7 +303,7 @@ class GeminiNode(ComfyNodeABC): """ return GeminiPart(text=text) - def api_call( + async def api_call( self, prompt: str, model: GeminiModel, @@ -332,7 +332,7 @@ class GeminiNode(ComfyNodeABC): parts.extend(files) # Create response - response = SynchronousOperation( + response = await SynchronousOperation( endpoint=get_gemini_endpoint(model), request=GeminiGenerateContentRequest( contents=[ diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index b8487355f..db24e6da4 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -212,7 +212,7 @@ V3_RESOLUTIONS= [ "1536x640" ] -def download_and_process_images(image_urls): +async def download_and_process_images(image_urls): """Helper function to download and process multiple images from URLs""" # Initialize list to store image tensors @@ -220,7 +220,7 @@ def download_and_process_images(image_urls): for image_url in image_urls: # Using functions from apinode_utils.py to handle downloading and processing - image_bytesio = download_url_to_bytesio(image_url) # Download image content to BytesIO + image_bytesio = await download_url_to_bytesio(image_url) # Download image content to BytesIO img_tensor = bytesio_to_image_tensor(image_bytesio, mode="RGB") # Convert to torch.Tensor with RGB mode image_tensors.append(img_tensor) @@ -328,7 +328,7 @@ class IdeogramV1(ComfyNodeABC): DESCRIPTION = cleandoc(__doc__ or "") API_NODE = True - def api_call( + async def api_call( self, prompt, turbo=False, @@ -367,7 +367,7 @@ class IdeogramV1(ComfyNodeABC): auth_kwargs=kwargs, ) - response = operation.execute() + response = await operation.execute() if not response.data or len(response.data) == 0: raise Exception("No images were generated in the response") @@ -378,7 +378,7 @@ class IdeogramV1(ComfyNodeABC): raise Exception("No image URLs were generated in the response") display_image_urls_on_node(image_urls, unique_id) - return (download_and_process_images(image_urls),) + return (await download_and_process_images(image_urls),) class IdeogramV2(ComfyNodeABC): @@ -487,7 +487,7 @@ class IdeogramV2(ComfyNodeABC): DESCRIPTION = cleandoc(__doc__ or "") API_NODE = True - def api_call( + async def api_call( self, prompt, turbo=False, @@ -543,7 +543,7 @@ class IdeogramV2(ComfyNodeABC): auth_kwargs=kwargs, ) - response = operation.execute() + response = await operation.execute() if not response.data or len(response.data) == 0: raise Exception("No images were generated in the response") @@ -554,7 +554,7 @@ class IdeogramV2(ComfyNodeABC): raise Exception("No image URLs were generated in the response") display_image_urls_on_node(image_urls, unique_id) - return (download_and_process_images(image_urls),) + return (await download_and_process_images(image_urls),) class IdeogramV3(ComfyNodeABC): """ @@ -653,7 +653,7 @@ class IdeogramV3(ComfyNodeABC): DESCRIPTION = cleandoc(__doc__ or "") API_NODE = True - def api_call( + async def api_call( self, prompt, image=None, @@ -774,7 +774,7 @@ class IdeogramV3(ComfyNodeABC): ) # Execute the operation and process response - response = operation.execute() + response = await operation.execute() if not response.data or len(response.data) == 0: raise Exception("No images were generated in the response") @@ -785,7 +785,7 @@ class IdeogramV3(ComfyNodeABC): raise Exception("No image URLs were generated in the response") display_image_urls_on_node(image_urls, unique_id) - return (download_and_process_images(image_urls),) + return (await download_and_process_images(image_urls),) NODE_CLASS_MAPPINGS = { diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 69e9e5cf0..9d9eb5628 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -109,7 +109,7 @@ class KlingApiError(Exception): pass -def poll_until_finished( +async def poll_until_finished( auth_kwargs: dict[str, str], api_endpoint: ApiEndpoint[Any, R], result_url_extractor: Optional[Callable[[R], str]] = None, @@ -117,7 +117,7 @@ def poll_until_finished( node_id: Optional[str] = None, ) -> R: """Polls the Kling API endpoint until the task reaches a terminal state, then returns the response.""" - return PollingOperation( + return await PollingOperation( poll_endpoint=api_endpoint, completed_statuses=[ KlingTaskStatus.succeed.value, @@ -278,18 +278,18 @@ def get_images_urls_from_response(response) -> Optional[str]: return None -def video_result_to_node_output( +async def video_result_to_node_output( video: KlingVideoResult, ) -> tuple[VideoFromFile, str, str]: """Converts a KlingVideoResult to a tuple of (VideoFromFile, str, str) to be used as a ComfyUI node output.""" return ( - download_url_to_video_output(video.url), + await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration), ) -def image_result_to_node_output( +async def image_result_to_node_output( images: list[KlingImageResult], ) -> torch.Tensor: """ @@ -297,9 +297,9 @@ def image_result_to_node_output( If multiple images are returned, they will be stacked along the batch dimension. """ if len(images) == 1: - return download_url_to_image_tensor(images[0].url) + return await download_url_to_image_tensor(str(images[0].url)) else: - return torch.cat([download_url_to_image_tensor(image.url) for image in images]) + return torch.cat([await download_url_to_image_tensor(str(image.url)) for image in images]) class KlingNodeBase(ComfyNodeABC): @@ -467,10 +467,10 @@ class KlingTextToVideoNode(KlingNodeBase): RETURN_NAMES = ("VIDEO", "video_id", "duration") DESCRIPTION = "Kling Text to Video Node" - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> KlingText2VideoResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_TEXT_TO_VIDEO}/{task_id}", @@ -483,7 +483,7 @@ class KlingTextToVideoNode(KlingNodeBase): node_id=node_id, ) - def api_call( + async def api_call( self, prompt: str, negative_prompt: str, @@ -519,17 +519,17 @@ class KlingTextToVideoNode(KlingNodeBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_video_result_response(final_response) video = get_video_from_response(final_response) - return video_result_to_node_output(video) + return await video_result_to_node_output(video) class KlingCameraControlT2VNode(KlingTextToVideoNode): @@ -581,7 +581,7 @@ class KlingCameraControlT2VNode(KlingTextToVideoNode): DESCRIPTION = "Transform text into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original text." - def api_call( + async def api_call( self, prompt: str, negative_prompt: str, @@ -591,7 +591,7 @@ class KlingCameraControlT2VNode(KlingTextToVideoNode): unique_id: Optional[str] = None, **kwargs, ): - return super().api_call( + return await super().api_call( model_name=KlingVideoGenModelName.kling_v1, cfg_scale=cfg_scale, mode=KlingVideoGenMode.std, @@ -670,10 +670,10 @@ class KlingImage2VideoNode(KlingNodeBase): RETURN_NAMES = ("VIDEO", "video_id", "duration") DESCRIPTION = "Kling Image to Video Node" - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> KlingImage2VideoResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}", @@ -686,7 +686,7 @@ class KlingImage2VideoNode(KlingNodeBase): node_id=node_id, ) - def api_call( + async def api_call( self, start_frame: torch.Tensor, prompt: str, @@ -733,17 +733,17 @@ class KlingImage2VideoNode(KlingNodeBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_video_result_response(final_response) video = get_video_from_response(final_response) - return video_result_to_node_output(video) + return await video_result_to_node_output(video) class KlingCameraControlI2VNode(KlingImage2VideoNode): @@ -798,7 +798,7 @@ class KlingCameraControlI2VNode(KlingImage2VideoNode): DESCRIPTION = "Transform still images into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original image." - def api_call( + async def api_call( self, start_frame: torch.Tensor, prompt: str, @@ -809,7 +809,7 @@ class KlingCameraControlI2VNode(KlingImage2VideoNode): unique_id: Optional[str] = None, **kwargs, ): - return super().api_call( + return await super().api_call( model_name=KlingVideoGenModelName.kling_v1_5, start_frame=start_frame, cfg_scale=cfg_scale, @@ -897,7 +897,7 @@ class KlingStartEndFrameNode(KlingImage2VideoNode): DESCRIPTION = "Generate a video sequence that transitions between your provided start and end images. The node creates all frames in between, producing a smooth transformation from the first frame to the last." - def api_call( + async def api_call( self, start_frame: torch.Tensor, end_frame: torch.Tensor, @@ -912,7 +912,7 @@ class KlingStartEndFrameNode(KlingImage2VideoNode): mode, duration, model_name = KlingStartEndFrameNode.get_mode_string_mapping()[ mode ] - return super().api_call( + return await super().api_call( prompt=prompt, negative_prompt=negative_prompt, model_name=model_name, @@ -964,10 +964,10 @@ class KlingVideoExtendNode(KlingNodeBase): RETURN_NAMES = ("VIDEO", "video_id", "duration") DESCRIPTION = "Kling Video Extend Node. Extend videos made by other Kling nodes. The video_id is created by using other Kling Nodes." - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> KlingVideoExtendResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_VIDEO_EXTEND}/{task_id}", @@ -980,7 +980,7 @@ class KlingVideoExtendNode(KlingNodeBase): node_id=node_id, ) - def api_call( + async def api_call( self, prompt: str, negative_prompt: str, @@ -1006,17 +1006,17 @@ class KlingVideoExtendNode(KlingNodeBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_video_result_response(final_response) video = get_video_from_response(final_response) - return video_result_to_node_output(video) + return await video_result_to_node_output(video) class KlingVideoEffectsBase(KlingNodeBase): @@ -1025,10 +1025,10 @@ class KlingVideoEffectsBase(KlingNodeBase): RETURN_TYPES = ("VIDEO", "STRING", "STRING") RETURN_NAMES = ("VIDEO", "video_id", "duration") - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> KlingVideoEffectsResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_VIDEO_EFFECTS}/{task_id}", @@ -1041,7 +1041,7 @@ class KlingVideoEffectsBase(KlingNodeBase): node_id=node_id, ) - def api_call( + async def api_call( self, dual_character: bool, effect_scene: KlingDualCharacterEffectsScene | KlingSingleImageEffectsScene, @@ -1084,17 +1084,17 @@ class KlingVideoEffectsBase(KlingNodeBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_video_result_response(final_response) video = get_video_from_response(final_response) - return video_result_to_node_output(video) + return await video_result_to_node_output(video) class KlingDualCharacterVideoEffectNode(KlingVideoEffectsBase): @@ -1142,7 +1142,7 @@ class KlingDualCharacterVideoEffectNode(KlingVideoEffectsBase): RETURN_TYPES = ("VIDEO", "STRING") RETURN_NAMES = ("VIDEO", "duration") - def api_call( + async def api_call( self, image_left: torch.Tensor, image_right: torch.Tensor, @@ -1153,7 +1153,7 @@ class KlingDualCharacterVideoEffectNode(KlingVideoEffectsBase): unique_id: Optional[str] = None, **kwargs, ): - video, _, duration = super().api_call( + video, _, duration = await super().api_call( dual_character=True, effect_scene=effect_scene, model_name=model_name, @@ -1208,7 +1208,7 @@ class KlingSingleImageVideoEffectNode(KlingVideoEffectsBase): DESCRIPTION = "Achieve different special effects when generating a video based on the effect_scene." - def api_call( + async def api_call( self, image: torch.Tensor, effect_scene: KlingSingleImageEffectsScene, @@ -1217,7 +1217,7 @@ class KlingSingleImageVideoEffectNode(KlingVideoEffectsBase): unique_id: Optional[str] = None, **kwargs, ): - return super().api_call( + return await super().api_call( dual_character=False, effect_scene=effect_scene, model_name=model_name, @@ -1253,11 +1253,11 @@ class KlingLipSyncBase(KlingNodeBase): f"Text is too long. Maximum length is {MAX_PROMPT_LENGTH_LIP_SYNC} characters." ) - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> KlingLipSyncResponse: """Polls the Kling API endpoint until the task reaches a terminal state.""" - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_LIP_SYNC}/{task_id}", @@ -1270,7 +1270,7 @@ class KlingLipSyncBase(KlingNodeBase): node_id=node_id, ) - def api_call( + async def api_call( self, video: VideoInput, audio: Optional[AudioInput] = None, @@ -1287,12 +1287,12 @@ class KlingLipSyncBase(KlingNodeBase): self.validate_lip_sync_video(video) # Upload video to Comfy API and get download URL - video_url = upload_video_to_comfyapi(video, auth_kwargs=kwargs) + video_url = await upload_video_to_comfyapi(video, auth_kwargs=kwargs) logging.info("Uploaded video to Comfy API. URL: %s", video_url) # Upload the audio file to Comfy API and get download URL if audio: - audio_url = upload_audio_to_comfyapi(audio, auth_kwargs=kwargs) + audio_url = await upload_audio_to_comfyapi(audio, auth_kwargs=kwargs) logging.info("Uploaded audio to Comfy API. URL: %s", audio_url) else: audio_url = None @@ -1319,17 +1319,17 @@ class KlingLipSyncBase(KlingNodeBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_video_result_response(final_response) video = get_video_from_response(final_response) - return video_result_to_node_output(video) + return await video_result_to_node_output(video) class KlingLipSyncAudioToVideoNode(KlingLipSyncBase): @@ -1357,7 +1357,7 @@ class KlingLipSyncAudioToVideoNode(KlingLipSyncBase): DESCRIPTION = "Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file. When using, ensure that the audio contains clearly distinguishable vocals and that the video contains a distinct face. The audio file should not be larger than 5MB. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length." - def api_call( + async def api_call( self, video: VideoInput, audio: AudioInput, @@ -1365,7 +1365,7 @@ class KlingLipSyncAudioToVideoNode(KlingLipSyncBase): unique_id: Optional[str] = None, **kwargs, ): - return super().api_call( + return await super().api_call( video=video, audio=audio, voice_language=voice_language, @@ -1469,7 +1469,7 @@ class KlingLipSyncTextToVideoNode(KlingLipSyncBase): DESCRIPTION = "Kling Lip Sync Text to Video Node. Syncs mouth movements in a video file to a text prompt. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length." - def api_call( + async def api_call( self, video: VideoInput, text: str, @@ -1479,7 +1479,7 @@ class KlingLipSyncTextToVideoNode(KlingLipSyncBase): **kwargs, ): voice_id, voice_language = KlingLipSyncTextToVideoNode.get_voice_config()[voice] - return super().api_call( + return await super().api_call( video=video, text=text, voice_language=voice_language, @@ -1533,10 +1533,10 @@ class KlingVirtualTryOnNode(KlingImageGenerationBase): DESCRIPTION = "Kling Virtual Try On Node. Input a human image and a cloth image to try on the cloth on the human. You can merge multiple clothing item pictures into one image with a white background." - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> KlingVirtualTryOnResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_VIRTUAL_TRY_ON}/{task_id}", @@ -1549,7 +1549,7 @@ class KlingVirtualTryOnNode(KlingImageGenerationBase): node_id=node_id, ) - def api_call( + async def api_call( self, human_image: torch.Tensor, cloth_image: torch.Tensor, @@ -1572,17 +1572,17 @@ class KlingVirtualTryOnNode(KlingImageGenerationBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_image_result_response(final_response) images = get_images_from_response(final_response) - return (image_result_to_node_output(images),) + return (await image_result_to_node_output(images),) class KlingImageGenerationNode(KlingImageGenerationBase): @@ -1655,13 +1655,13 @@ class KlingImageGenerationNode(KlingImageGenerationBase): DESCRIPTION = "Kling Image Generation Node. Generate an image from a text prompt with an optional reference image." - def get_response( + async def get_response( self, task_id: str, auth_kwargs: Optional[dict[str, str]], node_id: Optional[str] = None, ) -> KlingImageGenerationsResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_IMAGE_GENERATIONS}/{task_id}", @@ -1674,7 +1674,7 @@ class KlingImageGenerationNode(KlingImageGenerationBase): node_id=node_id, ) - def api_call( + async def api_call( self, model_name: KlingImageGenModelName, prompt: str, @@ -1714,17 +1714,17 @@ class KlingImageGenerationNode(KlingImageGenerationBase): auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) validate_image_result_response(final_response) images = get_images_from_response(final_response) - return (image_result_to_node_output(images),) + return (await image_result_to_node_output(images),) NODE_CLASS_MAPPINGS = { diff --git a/comfy_api_nodes/nodes_luma.py b/comfy_api_nodes/nodes_luma.py index 525dc38e6..b3c32bed5 100644 --- a/comfy_api_nodes/nodes_luma.py +++ b/comfy_api_nodes/nodes_luma.py @@ -38,7 +38,7 @@ from comfy_api_nodes.apinode_utils import ( ) from server import PromptServer -import requests +import aiohttp import torch from io import BytesIO @@ -217,7 +217,7 @@ class LumaImageGenerationNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, prompt: str, model: str, @@ -234,19 +234,19 @@ class LumaImageGenerationNode(ComfyNodeABC): # handle image_luma_ref api_image_ref = None if image_luma_ref is not None: - api_image_ref = self._convert_luma_refs( + api_image_ref = await self._convert_luma_refs( image_luma_ref, max_refs=4, auth_kwargs=kwargs, ) # handle style_luma_ref api_style_ref = None if style_image is not None: - api_style_ref = self._convert_style_image( + api_style_ref = await self._convert_style_image( style_image, weight=style_image_weight, auth_kwargs=kwargs, ) # handle character_ref images character_ref = None if character_image is not None: - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( character_image, max_images=4, auth_kwargs=kwargs, ) character_ref = LumaCharacterRef( @@ -270,7 +270,7 @@ class LumaImageGenerationNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api: LumaGeneration = operation.execute() + response_api: LumaGeneration = await operation.execute() operation = PollingOperation( poll_endpoint=ApiEndpoint( @@ -286,19 +286,20 @@ class LumaImageGenerationNode(ComfyNodeABC): node_id=unique_id, auth_kwargs=kwargs, ) - response_poll = operation.execute() + response_poll = await operation.execute() - img_response = requests.get(response_poll.assets.image) - img = process_image_response(img_response) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.assets.image) as img_response: + img = process_image_response(await img_response.content.read()) return (img,) - def _convert_luma_refs( + async def _convert_luma_refs( self, luma_ref: LumaReferenceChain, max_refs: int, auth_kwargs: Optional[dict[str,str]] = None ): luma_urls = [] ref_count = 0 for ref in luma_ref.refs: - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( ref.image, max_images=1, auth_kwargs=auth_kwargs ) luma_urls.append(download_urls[0]) @@ -307,13 +308,13 @@ class LumaImageGenerationNode(ComfyNodeABC): break return luma_ref.create_api_model(download_urls=luma_urls, max_refs=max_refs) - def _convert_style_image( + async def _convert_style_image( self, style_image: torch.Tensor, weight: float, auth_kwargs: Optional[dict[str,str]] = None ): chain = LumaReferenceChain( first_ref=LumaReference(image=style_image, weight=weight) ) - return self._convert_luma_refs(chain, max_refs=1, auth_kwargs=auth_kwargs) + return await self._convert_luma_refs(chain, max_refs=1, auth_kwargs=auth_kwargs) class LumaImageModifyNode(ComfyNodeABC): @@ -370,7 +371,7 @@ class LumaImageModifyNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, prompt: str, model: str, @@ -381,7 +382,7 @@ class LumaImageModifyNode(ComfyNodeABC): **kwargs, ): # first, upload image - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( image, max_images=1, auth_kwargs=kwargs, ) image_url = download_urls[0] @@ -402,7 +403,7 @@ class LumaImageModifyNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api: LumaGeneration = operation.execute() + response_api: LumaGeneration = await operation.execute() operation = PollingOperation( poll_endpoint=ApiEndpoint( @@ -418,10 +419,11 @@ class LumaImageModifyNode(ComfyNodeABC): node_id=unique_id, auth_kwargs=kwargs, ) - response_poll = operation.execute() + response_poll = await operation.execute() - img_response = requests.get(response_poll.assets.image) - img = process_image_response(img_response) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.assets.image) as img_response: + img = process_image_response(await img_response.content.read()) return (img,) @@ -494,7 +496,7 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, prompt: str, model: str, @@ -529,7 +531,7 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api: LumaGeneration = operation.execute() + response_api: LumaGeneration = await operation.execute() if unique_id: PromptServer.instance.send_progress_text(f"Luma video generation started: {response_api.id}", unique_id) @@ -549,10 +551,11 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC): estimated_duration=LUMA_T2V_AVERAGE_DURATION, auth_kwargs=kwargs, ) - response_poll = operation.execute() + response_poll = await operation.execute() - vid_response = requests.get(response_poll.assets.video) - return (VideoFromFile(BytesIO(vid_response.content)),) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.assets.video) as vid_response: + return (VideoFromFile(BytesIO(await vid_response.content.read())),) class LumaImageToVideoGenerationNode(ComfyNodeABC): @@ -626,7 +629,7 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, prompt: str, model: str, @@ -644,7 +647,7 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): raise Exception( "At least one of first_image and last_image requires an input." ) - keyframes = self._convert_to_keyframes(first_image, last_image, auth_kwargs=kwargs) + keyframes = await self._convert_to_keyframes(first_image, last_image, auth_kwargs=kwargs) duration = duration if model != LumaVideoModel.ray_1_6 else None resolution = resolution if model != LumaVideoModel.ray_1_6 else None @@ -667,7 +670,7 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api: LumaGeneration = operation.execute() + response_api: LumaGeneration = await operation.execute() if unique_id: PromptServer.instance.send_progress_text(f"Luma video generation started: {response_api.id}", unique_id) @@ -687,12 +690,13 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): estimated_duration=LUMA_I2V_AVERAGE_DURATION, auth_kwargs=kwargs, ) - response_poll = operation.execute() + response_poll = await operation.execute() - vid_response = requests.get(response_poll.assets.video) - return (VideoFromFile(BytesIO(vid_response.content)),) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.assets.video) as vid_response: + return (VideoFromFile(BytesIO(await vid_response.content.read())),) - def _convert_to_keyframes( + async def _convert_to_keyframes( self, first_image: torch.Tensor = None, last_image: torch.Tensor = None, @@ -703,12 +707,12 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): frame0 = None frame1 = None if first_image is not None: - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( first_image, max_images=1, auth_kwargs=auth_kwargs, ) frame0 = LumaImageReference(type="image", url=download_urls[0]) if last_image is not None: - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( last_image, max_images=1, auth_kwargs=auth_kwargs, ) frame1 = LumaImageReference(type="image", url=download_urls[0]) diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index 9b46636db..58d2ed90c 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -86,7 +86,7 @@ class MinimaxTextToVideoNode: API_NODE = True OUTPUT_NODE = True - def generate_video( + async def generate_video( self, prompt_text, seed=0, @@ -104,12 +104,12 @@ class MinimaxTextToVideoNode: # upload image, if passed in image_url = None if image is not None: - image_url = upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs)[0] + image_url = (await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs))[0] # TODO: figure out how to deal with subject properly, API returns invalid params when using S2V-01 model subject_reference = None if subject is not None: - subject_url = upload_images_to_comfyapi(subject, max_images=1, auth_kwargs=kwargs)[0] + subject_url = (await upload_images_to_comfyapi(subject, max_images=1, auth_kwargs=kwargs))[0] subject_reference = [SubjectReferenceItem(image=subject_url)] @@ -130,7 +130,7 @@ class MinimaxTextToVideoNode: ), auth_kwargs=kwargs, ) - response = video_generate_operation.execute() + response = await video_generate_operation.execute() task_id = response.task_id if not task_id: @@ -151,7 +151,7 @@ class MinimaxTextToVideoNode: node_id=unique_id, auth_kwargs=kwargs, ) - task_result = video_generate_operation.execute() + task_result = await video_generate_operation.execute() file_id = task_result.file_id if file_id is None: @@ -167,7 +167,7 @@ class MinimaxTextToVideoNode: request=EmptyRequest(), auth_kwargs=kwargs, ) - file_result = file_retrieve_operation.execute() + file_result = await file_retrieve_operation.execute() file_url = file_result.file.download_url if file_url is None: @@ -182,7 +182,7 @@ class MinimaxTextToVideoNode: message = f"Result URL: {file_url}" PromptServer.instance.send_progress_text(message, unique_id) - video_io = download_url_to_bytesio(file_url) + video_io = await download_url_to_bytesio(file_url) if video_io is None: error_msg = f"Failed to download video from {file_url}" logging.error(error_msg) diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 789fcef02..164ca3ea5 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -95,14 +95,14 @@ def get_video_url_from_response(response) -> Optional[str]: return None -def poll_until_finished( +async def poll_until_finished( auth_kwargs: dict[str, str], api_endpoint: ApiEndpoint[Any, R], result_url_extractor: Optional[Callable[[R], str]] = None, node_id: Optional[str] = None, ) -> R: """Polls the Moonvalley API endpoint until the task reaches a terminal state, then returns the response.""" - return PollingOperation( + return await PollingOperation( poll_endpoint=api_endpoint, completed_statuses=[ "completed", @@ -394,10 +394,10 @@ class BaseMoonvalleyVideoNode: else: return control_map["Motion Transfer"] - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> MoonvalleyPromptResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{API_PROMPTS_ENDPOINT}/{task_id}", @@ -507,7 +507,7 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): RETURN_NAMES = ("video",) DESCRIPTION = "Moonvalley Marey Image to Video Node" - def generate( + async def generate( self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs ): image = kwargs.get("image", None) @@ -532,9 +532,9 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): # Get MIME type from tensor - assuming PNG format for image tensors mime_type = "image/png" - image_url = upload_images_to_comfyapi( + image_url = (await upload_images_to_comfyapi( image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type - )[0] + ))[0] request = MoonvalleyTextToVideoRequest( image_url=image_url, prompt_text=prompt, inference_params=inference_params @@ -549,14 +549,14 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): request=request, auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) - video = download_url_to_video_output(final_response.output_url) + video = await download_url_to_video_output(final_response.output_url) return (video,) @@ -609,7 +609,7 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): RETURN_TYPES = ("VIDEO",) RETURN_NAMES = ("video",) - def generate( + async def generate( self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs ): video = kwargs.get("video") @@ -620,7 +620,7 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): video_url = "" if video: validated_video = validate_video_to_video_input(video) - video_url = upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) + video_url = await upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) control_type = kwargs.get("control_type") motion_intensity = kwargs.get("motion_intensity") @@ -658,15 +658,15 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): request=request, auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) - video = download_url_to_video_output(final_response.output_url) + video = await download_url_to_video_output(final_response.output_url) return (video,) @@ -688,7 +688,7 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): del input_types["optional"][param] return input_types - def generate( + async def generate( self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs ): validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) @@ -717,15 +717,15 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): request=request, auth_kwargs=kwargs, ) - task_creation_response = initial_operation.execute() + task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.id - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) - video = download_url_to_video_output(final_response.output_url) + video = await download_url_to_video_output(final_response.output_url) return (video,) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index be1d2de4a..ab3c5363b 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -163,7 +163,7 @@ class OpenAIDalle2(ComfyNodeABC): DESCRIPTION = cleandoc(__doc__ or "") API_NODE = True - def api_call( + async def api_call( self, prompt, seed=0, @@ -233,9 +233,9 @@ class OpenAIDalle2(ComfyNodeABC): auth_kwargs=kwargs, ) - response = operation.execute() + response = await operation.execute() - img_tensor = validate_and_cast_response(response, node_id=unique_id) + img_tensor = await validate_and_cast_response(response, node_id=unique_id) return (img_tensor,) @@ -311,7 +311,7 @@ class OpenAIDalle3(ComfyNodeABC): DESCRIPTION = cleandoc(__doc__ or "") API_NODE = True - def api_call( + async def api_call( self, prompt, seed=0, @@ -343,9 +343,9 @@ class OpenAIDalle3(ComfyNodeABC): auth_kwargs=kwargs, ) - response = operation.execute() + response = await operation.execute() - img_tensor = validate_and_cast_response(response, node_id=unique_id) + img_tensor = await validate_and_cast_response(response, node_id=unique_id) return (img_tensor,) @@ -446,7 +446,7 @@ class OpenAIGPTImage1(ComfyNodeABC): DESCRIPTION = cleandoc(__doc__ or "") API_NODE = True - def api_call( + async def api_call( self, prompt, seed=0, @@ -537,9 +537,9 @@ class OpenAIGPTImage1(ComfyNodeABC): auth_kwargs=kwargs, ) - response = operation.execute() + response = await operation.execute() - img_tensor = validate_and_cast_response(response, node_id=unique_id) + img_tensor = await validate_and_cast_response(response, node_id=unique_id) return (img_tensor,) @@ -623,7 +623,7 @@ class OpenAIChatNode(OpenAITextNode): DESCRIPTION = "Generate text responses from an OpenAI model." - def get_result_response( + async def get_result_response( self, response_id: str, include: Optional[list[Includable]] = None, @@ -639,7 +639,7 @@ class OpenAIChatNode(OpenAITextNode): creation above for more information. """ - return PollingOperation( + return await PollingOperation( poll_endpoint=ApiEndpoint( path=f"{RESPONSES_ENDPOINT}/{response_id}", method=HttpMethod.GET, @@ -784,7 +784,7 @@ class OpenAIChatNode(OpenAITextNode): self.history[session_id] = new_history - def api_call( + async def api_call( self, prompt: str, persist_context: bool, @@ -815,7 +815,7 @@ class OpenAIChatNode(OpenAITextNode): previous_response_id = None # Create response - create_response = SynchronousOperation( + create_response = await SynchronousOperation( endpoint=ApiEndpoint( path=RESPONSES_ENDPOINT, method=HttpMethod.POST, @@ -848,7 +848,7 @@ class OpenAIChatNode(OpenAITextNode): response_id = create_response.id # Get result output - result_response = self.get_result_response(response_id, auth_kwargs=kwargs) + result_response = await self.get_result_response(response_id, auth_kwargs=kwargs) output_text = self.parse_output_text_from_response(result_response) # Update history diff --git a/comfy_api_nodes/nodes_pika.py b/comfy_api_nodes/nodes_pika.py index 1cc708564..a8dc43cb3 100644 --- a/comfy_api_nodes/nodes_pika.py +++ b/comfy_api_nodes/nodes_pika.py @@ -122,7 +122,7 @@ class PikaNodeBase(ComfyNodeABC): FUNCTION = "api_call" RETURN_TYPES = ("VIDEO",) - def poll_for_task_status( + async def poll_for_task_status( self, task_id: str, auth_kwargs: Optional[dict[str, str]] = None, @@ -152,9 +152,9 @@ class PikaNodeBase(ComfyNodeABC): node_id=node_id, estimated_duration=60 ) - return polling_operation.execute() + return await polling_operation.execute() - def execute_task( + async def execute_task( self, initial_operation: SynchronousOperation[R, PikaGenerateResponse], auth_kwargs: Optional[dict[str, str]] = None, @@ -169,14 +169,14 @@ class PikaNodeBase(ComfyNodeABC): Returns: A tuple containing the video file as a VIDEO output. """ - initial_response = initial_operation.execute() + initial_response = await initial_operation.execute() if not is_valid_initial_response(initial_response): error_msg = f"Pika initial request failed. Code: {initial_response.code}, Message: {initial_response.message}, Data: {initial_response.data}" logging.error(error_msg) raise PikaApiError(error_msg) task_id = initial_response.video_id - final_response = self.poll_for_task_status(task_id, auth_kwargs) + final_response = await self.poll_for_task_status(task_id, auth_kwargs) if not is_valid_video_response(final_response): error_msg = ( f"Pika task {task_id} succeeded but no video data found in response." @@ -187,7 +187,7 @@ class PikaNodeBase(ComfyNodeABC): video_url = str(final_response.url) logging.info("Pika task %s succeeded. Video URL: %s", task_id, video_url) - return (download_url_to_video_output(video_url),) + return (await download_url_to_video_output(video_url),) class PikaImageToVideoV2_2(PikaNodeBase): @@ -212,7 +212,7 @@ class PikaImageToVideoV2_2(PikaNodeBase): DESCRIPTION = "Sends an image and prompt to the Pika API v2.2 to generate a video." - def api_call( + async def api_call( self, image: torch.Tensor, prompt_text: str, @@ -251,7 +251,7 @@ class PikaImageToVideoV2_2(PikaNodeBase): auth_kwargs=kwargs, ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) class PikaTextToVideoNodeV2_2(PikaNodeBase): @@ -281,7 +281,7 @@ class PikaTextToVideoNodeV2_2(PikaNodeBase): DESCRIPTION = "Sends a text prompt to the Pika API v2.2 to generate a video." - def api_call( + async def api_call( self, prompt_text: str, negative_prompt: str, @@ -311,7 +311,7 @@ class PikaTextToVideoNodeV2_2(PikaNodeBase): content_type="application/x-www-form-urlencoded", ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) class PikaScenesV2_2(PikaNodeBase): @@ -361,7 +361,7 @@ class PikaScenesV2_2(PikaNodeBase): DESCRIPTION = "Combine your images to create a video with the objects in them. Upload multiple images as ingredients and generate a high-quality video that incorporates all of them." - def api_call( + async def api_call( self, prompt_text: str, negative_prompt: str, @@ -420,7 +420,7 @@ class PikaScenesV2_2(PikaNodeBase): auth_kwargs=kwargs, ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) class PikAdditionsNode(PikaNodeBase): @@ -462,7 +462,7 @@ class PikAdditionsNode(PikaNodeBase): DESCRIPTION = "Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result." - def api_call( + async def api_call( self, video: VideoInput, image: torch.Tensor, @@ -481,10 +481,10 @@ class PikAdditionsNode(PikaNodeBase): image_bytes_io = tensor_to_bytesio(image) image_bytes_io.seek(0) - pika_files = [ - ("video", ("video.mp4", video_bytes_io, "video/mp4")), - ("image", ("image.png", image_bytes_io, "image/png")), - ] + pika_files = { + "video": ("video.mp4", video_bytes_io, "video/mp4"), + "image": ("image.png", image_bytes_io, "image/png"), + } # Prepare non-file data pika_request_data = PikaBodyGeneratePikadditionsGeneratePikadditionsPost( @@ -506,7 +506,7 @@ class PikAdditionsNode(PikaNodeBase): auth_kwargs=kwargs, ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) class PikaSwapsNode(PikaNodeBase): @@ -558,7 +558,7 @@ class PikaSwapsNode(PikaNodeBase): DESCRIPTION = "Swap out any object or region of your video with a new image or object. Define areas to replace either with a mask or coordinates." RETURN_TYPES = ("VIDEO",) - def api_call( + async def api_call( self, video: VideoInput, image: torch.Tensor, @@ -587,11 +587,11 @@ class PikaSwapsNode(PikaNodeBase): image_bytes_io = tensor_to_bytesio(image) image_bytes_io.seek(0) - pika_files = [ - ("video", ("video.mp4", video_bytes_io, "video/mp4")), - ("image", ("image.png", image_bytes_io, "image/png")), - ("modifyRegionMask", ("mask.png", mask_bytes_io, "image/png")), - ] + pika_files = { + "video": ("video.mp4", video_bytes_io, "video/mp4"), + "image": ("image.png", image_bytes_io, "image/png"), + "modifyRegionMask": ("mask.png", mask_bytes_io, "image/png"), + } # Prepare non-file data pika_request_data = PikaBodyGeneratePikaswapsGeneratePikaswapsPost( @@ -613,7 +613,7 @@ class PikaSwapsNode(PikaNodeBase): auth_kwargs=kwargs, ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) class PikaffectsNode(PikaNodeBase): @@ -664,7 +664,7 @@ class PikaffectsNode(PikaNodeBase): DESCRIPTION = "Generate a video with a specific Pikaffect. Supported Pikaffects: Cake-ify, Crumble, Crush, Decapitate, Deflate, Dissolve, Explode, Eye-pop, Inflate, Levitate, Melt, Peel, Poke, Squish, Ta-da, Tear" - def api_call( + async def api_call( self, image: torch.Tensor, pikaffect: str, @@ -693,7 +693,7 @@ class PikaffectsNode(PikaNodeBase): auth_kwargs=kwargs, ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) class PikaStartEndFrameNode2_2(PikaNodeBase): @@ -718,7 +718,7 @@ class PikaStartEndFrameNode2_2(PikaNodeBase): DESCRIPTION = "Generate a video by combining your first and last frame. Upload two images to define the start and end points, and let the AI create a smooth transition between them." - def api_call( + async def api_call( self, image_start: torch.Tensor, image_end: torch.Tensor, @@ -732,10 +732,7 @@ class PikaStartEndFrameNode2_2(PikaNodeBase): ) -> tuple[VideoFromFile]: pika_files = [ - ( - "keyFrames", - ("image_start.png", tensor_to_bytesio(image_start), "image/png"), - ), + ("keyFrames", ("image_start.png", tensor_to_bytesio(image_start), "image/png")), ("keyFrames", ("image_end.png", tensor_to_bytesio(image_end), "image/png")), ] @@ -758,7 +755,7 @@ class PikaStartEndFrameNode2_2(PikaNodeBase): auth_kwargs=kwargs, ) - return self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) NODE_CLASS_MAPPINGS = { diff --git a/comfy_api_nodes/nodes_pixverse.py b/comfy_api_nodes/nodes_pixverse.py index ef4a9a802..7c5a52feb 100644 --- a/comfy_api_nodes/nodes_pixverse.py +++ b/comfy_api_nodes/nodes_pixverse.py @@ -30,7 +30,7 @@ from comfy.comfy_types.node_typing import IO, ComfyNodeABC from comfy_api.input_impl import VideoFromFile import torch -import requests +import aiohttp from io import BytesIO @@ -47,7 +47,7 @@ def get_video_url_from_response( return str(response.Resp.url) -def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None): +async def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None): # first, upload image to Pixverse and get image id to use in actual generation call files = {"image": tensor_to_bytesio(image)} operation = SynchronousOperation( @@ -62,7 +62,7 @@ def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None): content_type="multipart/form-data", auth_kwargs=auth_kwargs, ) - response_upload: PixverseImageUploadResponse = operation.execute() + response_upload: PixverseImageUploadResponse = await operation.execute() if response_upload.Resp is None: raise Exception( @@ -164,7 +164,7 @@ class PixverseTextToVideoNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, prompt: str, aspect_ratio: str, @@ -205,7 +205,7 @@ class PixverseTextToVideoNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() if response_api.Resp is None: raise Exception(f"PixVerse request failed: '{response_api.ErrMsg}'") @@ -229,11 +229,11 @@ class PixverseTextToVideoNode(ComfyNodeABC): result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_T2V, ) - response_poll = operation.execute() + response_poll = await operation.execute() - vid_response = requests.get(response_poll.Resp.url) - - return (VideoFromFile(BytesIO(vid_response.content)),) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.Resp.url) as vid_response: + return (VideoFromFile(BytesIO(await vid_response.content.read())),) class PixverseImageToVideoNode(ComfyNodeABC): @@ -302,7 +302,7 @@ class PixverseImageToVideoNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, image: torch.Tensor, prompt: str, @@ -316,7 +316,7 @@ class PixverseImageToVideoNode(ComfyNodeABC): **kwargs, ): validate_string(prompt, strip_whitespace=False) - img_id = upload_image_to_pixverse(image, auth_kwargs=kwargs) + img_id = await upload_image_to_pixverse(image, auth_kwargs=kwargs) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration @@ -345,7 +345,7 @@ class PixverseImageToVideoNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() if response_api.Resp is None: raise Exception(f"PixVerse request failed: '{response_api.ErrMsg}'") @@ -369,10 +369,11 @@ class PixverseImageToVideoNode(ComfyNodeABC): result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_I2V, ) - response_poll = operation.execute() + response_poll = await operation.execute() - vid_response = requests.get(response_poll.Resp.url) - return (VideoFromFile(BytesIO(vid_response.content)),) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.Resp.url) as vid_response: + return (VideoFromFile(BytesIO(await vid_response.content.read())),) class PixverseTransitionVideoNode(ComfyNodeABC): @@ -436,7 +437,7 @@ class PixverseTransitionVideoNode(ComfyNodeABC): }, } - def api_call( + async def api_call( self, first_frame: torch.Tensor, last_frame: torch.Tensor, @@ -450,8 +451,8 @@ class PixverseTransitionVideoNode(ComfyNodeABC): **kwargs, ): validate_string(prompt, strip_whitespace=False) - first_frame_id = upload_image_to_pixverse(first_frame, auth_kwargs=kwargs) - last_frame_id = upload_image_to_pixverse(last_frame, auth_kwargs=kwargs) + first_frame_id = await upload_image_to_pixverse(first_frame, auth_kwargs=kwargs) + last_frame_id = await upload_image_to_pixverse(last_frame, auth_kwargs=kwargs) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration @@ -480,7 +481,7 @@ class PixverseTransitionVideoNode(ComfyNodeABC): ), auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() if response_api.Resp is None: raise Exception(f"PixVerse request failed: '{response_api.ErrMsg}'") @@ -504,10 +505,11 @@ class PixverseTransitionVideoNode(ComfyNodeABC): result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_T2V, ) - response_poll = operation.execute() + response_poll = await operation.execute() - vid_response = requests.get(response_poll.Resp.url) - return (VideoFromFile(BytesIO(vid_response.content)),) + async with aiohttp.ClientSession() as session: + async with session.get(response_poll.Resp.url) as vid_response: + return (VideoFromFile(BytesIO(await vid_response.content.read())),) NODE_CLASS_MAPPINGS = { diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index e369c4b7e..c8516b368 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -37,7 +37,7 @@ from io import BytesIO from PIL import UnidentifiedImageError -def handle_recraft_file_request( +async def handle_recraft_file_request( image: torch.Tensor, path: str, mask: torch.Tensor=None, @@ -71,13 +71,13 @@ def handle_recraft_file_request( auth_kwargs=auth_kwargs, multipart_parser=recraft_multipart_parser, ) - response: RecraftImageGenerationResponse = operation.execute() + response: RecraftImageGenerationResponse = await operation.execute() all_bytesio = [] if response.image is not None: - all_bytesio.append(download_url_to_bytesio(response.image.url, timeout=timeout)) + all_bytesio.append(await download_url_to_bytesio(response.image.url, timeout=timeout)) else: for data in response.data: - all_bytesio.append(download_url_to_bytesio(data.url, timeout=timeout)) + all_bytesio.append(await download_url_to_bytesio(data.url, timeout=timeout)) return all_bytesio @@ -395,7 +395,7 @@ class RecraftTextToImageNode: }, } - def api_call( + async def api_call( self, prompt: str, size: str, @@ -439,7 +439,7 @@ class RecraftTextToImageNode: ), auth_kwargs=kwargs, ) - response: RecraftImageGenerationResponse = operation.execute() + response: RecraftImageGenerationResponse = await operation.execute() images = [] urls = [] for data in response.data: @@ -451,7 +451,7 @@ class RecraftTextToImageNode: f"Result URL: {urls_string}", unique_id ) image = bytesio_to_image_tensor( - download_url_to_bytesio(data.url, timeout=1024) + await download_url_to_bytesio(data.url, timeout=1024) ) if len(image.shape) < 4: image = image.unsqueeze(0) @@ -538,7 +538,7 @@ class RecraftImageToImageNode: }, } - def api_call( + async def api_call( self, image: torch.Tensor, prompt: str, @@ -578,7 +578,7 @@ class RecraftImageToImageNode: total = image.shape[0] pbar = ProgressBar(total) for i in range(total): - sub_bytes = handle_recraft_file_request( + sub_bytes = await handle_recraft_file_request( image=image[i], path="/proxy/recraft/images/imageToImage", request=request, @@ -654,7 +654,7 @@ class RecraftImageInpaintingNode: }, } - def api_call( + async def api_call( self, image: torch.Tensor, mask: torch.Tensor, @@ -690,7 +690,7 @@ class RecraftImageInpaintingNode: total = image.shape[0] pbar = ProgressBar(total) for i in range(total): - sub_bytes = handle_recraft_file_request( + sub_bytes = await handle_recraft_file_request( image=image[i], mask=mask[i:i+1], path="/proxy/recraft/images/inpaint", @@ -779,7 +779,7 @@ class RecraftTextToVectorNode: }, } - def api_call( + async def api_call( self, prompt: str, substyle: str, @@ -821,7 +821,7 @@ class RecraftTextToVectorNode: ), auth_kwargs=kwargs, ) - response: RecraftImageGenerationResponse = operation.execute() + response: RecraftImageGenerationResponse = await operation.execute() svg_data = [] urls = [] for data in response.data: @@ -831,7 +831,7 @@ class RecraftTextToVectorNode: PromptServer.instance.send_progress_text( f"Result URL: {' '.join(urls)}", unique_id ) - svg_data.append(download_url_to_bytesio(data.url, timeout=1024)) + svg_data.append(await download_url_to_bytesio(data.url, timeout=1024)) return (SVG(svg_data),) @@ -861,7 +861,7 @@ class RecraftVectorizeImageNode: }, } - def api_call( + async def api_call( self, image: torch.Tensor, **kwargs, @@ -870,7 +870,7 @@ class RecraftVectorizeImageNode: total = image.shape[0] pbar = ProgressBar(total) for i in range(total): - sub_bytes = handle_recraft_file_request( + sub_bytes = await handle_recraft_file_request( image=image[i], path="/proxy/recraft/images/vectorize", auth_kwargs=kwargs, @@ -942,7 +942,7 @@ class RecraftReplaceBackgroundNode: }, } - def api_call( + async def api_call( self, image: torch.Tensor, prompt: str, @@ -973,7 +973,7 @@ class RecraftReplaceBackgroundNode: total = image.shape[0] pbar = ProgressBar(total) for i in range(total): - sub_bytes = handle_recraft_file_request( + sub_bytes = await handle_recraft_file_request( image=image[i], path="/proxy/recraft/images/replaceBackground", request=request, @@ -1011,7 +1011,7 @@ class RecraftRemoveBackgroundNode: }, } - def api_call( + async def api_call( self, image: torch.Tensor, **kwargs, @@ -1020,7 +1020,7 @@ class RecraftRemoveBackgroundNode: total = image.shape[0] pbar = ProgressBar(total) for i in range(total): - sub_bytes = handle_recraft_file_request( + sub_bytes = await handle_recraft_file_request( image=image[i], path="/proxy/recraft/images/removeBackground", auth_kwargs=kwargs, @@ -1062,7 +1062,7 @@ class RecraftCrispUpscaleNode: }, } - def api_call( + async def api_call( self, image: torch.Tensor, **kwargs, @@ -1071,7 +1071,7 @@ class RecraftCrispUpscaleNode: total = image.shape[0] pbar = ProgressBar(total) for i in range(total): - sub_bytes = handle_recraft_file_request( + sub_bytes = await handle_recraft_file_request( image=image[i], path=self.RECRAFT_PATH, auth_kwargs=kwargs, diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index 67f90478c..c89d087e5 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -9,11 +9,10 @@ from __future__ import annotations from inspect import cleandoc from comfy.comfy_types.node_typing import IO import folder_paths as comfy_paths -import requests +import aiohttp import os import datetime -import shutil -import time +import asyncio import io import logging import math @@ -66,7 +65,6 @@ def create_task_error(response: Rodin3DGenerateResponse): return hasattr(response, "error") - class Rodin3DAPI: """ Generate 3D Assets using Rodin API @@ -123,8 +121,8 @@ class Rodin3DAPI: else: return "Generating" - def CreateGenerateTask(self, images=None, seed=1, material="PBR", quality="medium", tier="Regular", mesh_mode="Quad", **kwargs): - if images == None: + async def create_generate_task(self, images=None, seed=1, material="PBR", quality="medium", tier="Regular", mesh_mode="Quad", **kwargs): + if images is None: raise Exception("Rodin 3D generate requires at least 1 image.") if len(images) >= 5: raise Exception("Rodin 3D generate requires up to 5 image.") @@ -155,7 +153,7 @@ class Rodin3DAPI: auth_kwargs=kwargs, ) - response = operation.execute() + response = await operation.execute() if create_task_error(response): error_message = f"Rodin3D Create 3D generate Task Failed. Message: {response.message}, error: {response.error}" @@ -168,7 +166,7 @@ class Rodin3DAPI: logging.info(f"[ Rodin3D API - Submit Jobs ] UUID: {task_uuid}") return task_uuid, subscription_key - def poll_for_task_status(self, subscription_key, **kwargs) -> Rodin3DCheckStatusResponse: + async def poll_for_task_status(self, subscription_key, **kwargs) -> Rodin3DCheckStatusResponse: path = "/proxy/rodin/api/v2/status" @@ -191,11 +189,9 @@ class Rodin3DAPI: logging.info("[ Rodin3D API - CheckStatus ] Generate Start!") - return poll_operation.execute() + return await poll_operation.execute() - - - def GetRodinDownloadList(self, uuid, **kwargs) -> Rodin3DDownloadResponse: + async def get_rodin_download_list(self, uuid, **kwargs) -> Rodin3DDownloadResponse: logging.info("[ Rodin3D API - Downloading ] Generate Successfully!") path = "/proxy/rodin/api/v2/download" @@ -212,53 +208,59 @@ class Rodin3DAPI: auth_kwargs=kwargs ) - return operation.execute() + return await operation.execute() - def GetQualityAndMode(self, PolyCount): - if PolyCount == "200K-Triangle": + def get_quality_mode(self, poly_count): + if poly_count == "200K-Triangle": mesh_mode = "Raw" quality = "medium" else: mesh_mode = "Quad" - if PolyCount == "4K-Quad": + if poly_count == "4K-Quad": quality = "extra-low" - elif PolyCount == "8K-Quad": + elif poly_count == "8K-Quad": quality = "low" - elif PolyCount == "18K-Quad": + elif poly_count == "18K-Quad": quality = "medium" - elif PolyCount == "50K-Quad": + elif poly_count == "50K-Quad": quality = "high" else: quality = "medium" return mesh_mode, quality - def DownLoadFiles(self, Url_List): - Save_path = os.path.join(comfy_paths.get_output_directory(), "Rodin3D", datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) - os.makedirs(Save_path, exist_ok=True) + async def download_files(self, url_list): + save_path = os.path.join(comfy_paths.get_output_directory(), "Rodin3D", datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) + os.makedirs(save_path, exist_ok=True) model_file_path = None - for Item in Url_List.list: - url = Item.url - file_name = Item.name - file_path = os.path.join(Save_path, file_name) - if file_path.endswith(".glb"): - model_file_path = file_path - logging.info(f"[ Rodin3D API - download_files ] Downloading file: {file_path}") - max_retries = 5 - for attempt in range(max_retries): - try: - with requests.get(url, stream=True) as r: - r.raise_for_status() - with open(file_path, "wb") as f: - shutil.copyfileobj(r.raw, f) - break - except Exception as e: - logging.info(f"[ Rodin3D API - download_files ] Error downloading {file_path}:{e}") - if attempt < max_retries - 1: - logging.info("Retrying...") - time.sleep(2) - else: - logging.info(f"[ Rodin3D API - download_files ] Failed to download {file_path} after {max_retries} attempts.") + async with aiohttp.ClientSession() as session: + for i in url_list.list: + url = i.url + file_name = i.name + file_path = os.path.join(save_path, file_name) + if file_path.endswith(".glb"): + model_file_path = file_path + logging.info(f"[ Rodin3D API - download_files ] Downloading file: {file_path}") + max_retries = 5 + for attempt in range(max_retries): + try: + async with session.get(url) as resp: + resp.raise_for_status() + with open(file_path, "wb") as f: + async for chunk in resp.content.iter_chunked(32 * 1024): + f.write(chunk) + break + except Exception as e: + logging.info(f"[ Rodin3D API - download_files ] Error downloading {file_path}:{e}") + if attempt < max_retries - 1: + logging.info("Retrying...") + await asyncio.sleep(2) + else: + logging.info( + "[ Rodin3D API - download_files ] Failed to download %s after %s attempts.", + file_path, + max_retries, + ) return model_file_path @@ -285,7 +287,7 @@ class Rodin3D_Regular(Rodin3DAPI): }, } - def api_call( + async def api_call( self, Images, Seed, @@ -298,14 +300,17 @@ class Rodin3D_Regular(Rodin3DAPI): m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality = self.GetQualityAndMode(Polygon_count) - task_uuid, subscription_key = self.CreateGenerateTask(images=m_images, seed=Seed, material=Material_Type, quality=quality, tier=tier, mesh_mode=mesh_mode, **kwargs) - self.poll_for_task_status(subscription_key, **kwargs) - Download_List = self.GetRodinDownloadList(task_uuid, **kwargs) - model = self.DownLoadFiles(Download_List) + mesh_mode, quality = self.get_quality_mode(Polygon_count) + task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, + quality=quality, tier=tier, mesh_mode=mesh_mode, + **kwargs) + await self.poll_for_task_status(subscription_key, **kwargs) + download_list = await self.get_rodin_download_list(task_uuid, **kwargs) + model = await self.download_files(download_list) return (model,) + class Rodin3D_Detail(Rodin3DAPI): @classmethod def INPUT_TYPES(s): @@ -328,7 +333,7 @@ class Rodin3D_Detail(Rodin3DAPI): }, } - def api_call( + async def api_call( self, Images, Seed, @@ -341,14 +346,17 @@ class Rodin3D_Detail(Rodin3DAPI): m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality = self.GetQualityAndMode(Polygon_count) - task_uuid, subscription_key = self.CreateGenerateTask(images=m_images, seed=Seed, material=Material_Type, quality=quality, tier=tier, mesh_mode=mesh_mode, **kwargs) - self.poll_for_task_status(subscription_key, **kwargs) - Download_List = self.GetRodinDownloadList(task_uuid, **kwargs) - model = self.DownLoadFiles(Download_List) + mesh_mode, quality = self.get_quality_mode(Polygon_count) + task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, + quality=quality, tier=tier, mesh_mode=mesh_mode, + **kwargs) + await self.poll_for_task_status(subscription_key, **kwargs) + download_list = await self.get_rodin_download_list(task_uuid, **kwargs) + model = await self.download_files(download_list) return (model,) + class Rodin3D_Smooth(Rodin3DAPI): @classmethod def INPUT_TYPES(s): @@ -371,7 +379,7 @@ class Rodin3D_Smooth(Rodin3DAPI): }, } - def api_call( + async def api_call( self, Images, Seed, @@ -384,14 +392,17 @@ class Rodin3D_Smooth(Rodin3DAPI): m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality = self.GetQualityAndMode(Polygon_count) - task_uuid, subscription_key = self.CreateGenerateTask(images=m_images, seed=Seed, material=Material_Type, quality=quality, tier=tier, mesh_mode=mesh_mode, **kwargs) - self.poll_for_task_status(subscription_key, **kwargs) - Download_List = self.GetRodinDownloadList(task_uuid, **kwargs) - model = self.DownLoadFiles(Download_List) + mesh_mode, quality = self.get_quality_mode(Polygon_count) + task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, + quality=quality, tier=tier, mesh_mode=mesh_mode, + **kwargs) + await self.poll_for_task_status(subscription_key, **kwargs) + download_list = await self.get_rodin_download_list(task_uuid, **kwargs) + model = await self.download_files(download_list) return (model,) + class Rodin3D_Sketch(Rodin3DAPI): @classmethod def INPUT_TYPES(s): @@ -423,7 +434,7 @@ class Rodin3D_Sketch(Rodin3DAPI): }, } - def api_call( + async def api_call( self, Images, Seed, @@ -437,10 +448,12 @@ class Rodin3D_Sketch(Rodin3DAPI): material_type = "PBR" quality = "medium" mesh_mode = "Quad" - task_uuid, subscription_key = self.CreateGenerateTask(images=m_images, seed=Seed, material=material_type, quality=quality, tier=tier, mesh_mode=mesh_mode, **kwargs) - self.poll_for_task_status(subscription_key, **kwargs) - Download_List = self.GetRodinDownloadList(task_uuid, **kwargs) - model = self.DownLoadFiles(Download_List) + task_uuid, subscription_key = await self.create_generate_task( + images=m_images, seed=Seed, material=material_type, quality=quality, tier=tier, mesh_mode=mesh_mode, **kwargs + ) + await self.poll_for_task_status(subscription_key, **kwargs) + download_list = await self.get_rodin_download_list(task_uuid, **kwargs) + model = await self.download_files(download_list) return (model,) diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py index af4b321f9..98024a9fa 100644 --- a/comfy_api_nodes/nodes_runway.py +++ b/comfy_api_nodes/nodes_runway.py @@ -99,14 +99,14 @@ def validate_input_image(image: torch.Tensor) -> bool: return image.shape[2] < 8000 and image.shape[1] < 8000 -def poll_until_finished( +async def poll_until_finished( auth_kwargs: dict[str, str], api_endpoint: ApiEndpoint[Any, TaskStatusResponse], estimated_duration: Optional[int] = None, node_id: Optional[str] = None, ) -> TaskStatusResponse: """Polls the Runway API endpoint until the task reaches a terminal state, then returns the response.""" - return PollingOperation( + return await PollingOperation( poll_endpoint=api_endpoint, completed_statuses=[ TaskStatus.SUCCEEDED.value, @@ -115,7 +115,7 @@ def poll_until_finished( TaskStatus.FAILED.value, TaskStatus.CANCELLED.value, ], - status_extractor=lambda response: (response.status.value), + status_extractor=lambda response: response.status.value, auth_kwargs=auth_kwargs, result_url_extractor=get_video_url_from_task_status, estimated_duration=estimated_duration, @@ -167,11 +167,11 @@ class RunwayVideoGenNode(ComfyNodeABC): ) return True - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> RunwayImageToVideoResponse: """Poll the task status until it is finished then get the response.""" - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_GET_TASK_STATUS}/{task_id}", @@ -183,7 +183,7 @@ class RunwayVideoGenNode(ComfyNodeABC): node_id=node_id, ) - def generate_video( + async def generate_video( self, request: RunwayImageToVideoRequest, auth_kwargs: dict[str, str], @@ -200,15 +200,15 @@ class RunwayVideoGenNode(ComfyNodeABC): auth_kwargs=auth_kwargs, ) - initial_response = initial_operation.execute() + initial_response = await initial_operation.execute() self.validate_task_created(initial_response) task_id = initial_response.id - final_response = self.get_response(task_id, auth_kwargs, node_id) + final_response = await self.get_response(task_id, auth_kwargs, node_id) self.validate_response(final_response) video_url = get_video_url_from_task_status(final_response) - return (download_url_to_video_output(video_url),) + return (await download_url_to_video_output(video_url),) class RunwayImageToVideoNodeGen3a(RunwayVideoGenNode): @@ -250,7 +250,7 @@ class RunwayImageToVideoNodeGen3a(RunwayVideoGenNode): }, } - def api_call( + async def api_call( self, prompt: str, start_frame: torch.Tensor, @@ -265,7 +265,7 @@ class RunwayImageToVideoNodeGen3a(RunwayVideoGenNode): validate_input_image(start_frame) # Upload image - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( start_frame, max_images=1, mime_type="image/png", @@ -274,7 +274,7 @@ class RunwayImageToVideoNodeGen3a(RunwayVideoGenNode): if len(download_urls) != 1: raise RunwayApiError("Failed to upload one or more images to comfy api.") - return self.generate_video( + return await self.generate_video( RunwayImageToVideoRequest( promptText=prompt, seed=seed, @@ -333,7 +333,7 @@ class RunwayImageToVideoNodeGen4(RunwayVideoGenNode): }, } - def api_call( + async def api_call( self, prompt: str, start_frame: torch.Tensor, @@ -348,7 +348,7 @@ class RunwayImageToVideoNodeGen4(RunwayVideoGenNode): validate_input_image(start_frame) # Upload image - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( start_frame, max_images=1, mime_type="image/png", @@ -357,7 +357,7 @@ class RunwayImageToVideoNodeGen4(RunwayVideoGenNode): if len(download_urls) != 1: raise RunwayApiError("Failed to upload one or more images to comfy api.") - return self.generate_video( + return await self.generate_video( RunwayImageToVideoRequest( promptText=prompt, seed=seed, @@ -382,10 +382,10 @@ class RunwayFirstLastFrameNode(RunwayVideoGenNode): DESCRIPTION = "Upload first and last keyframes, draft a prompt, and generate a video. More complex transitions, such as cases where the Last frame is completely different from the First frame, may benefit from the longer 10s duration. This would give the generation more time to smoothly transition between the two inputs. Before diving in, review these best practices to ensure that your input selections will set your generation up for success: https://help.runwayml.com/hc/en-us/articles/34170748696595-Creating-with-Keyframes-on-Gen-3." - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> RunwayImageToVideoResponse: - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_GET_TASK_STATUS}/{task_id}", @@ -437,7 +437,7 @@ class RunwayFirstLastFrameNode(RunwayVideoGenNode): }, } - def api_call( + async def api_call( self, prompt: str, start_frame: torch.Tensor, @@ -455,7 +455,7 @@ class RunwayFirstLastFrameNode(RunwayVideoGenNode): # Upload images stacked_input_images = image_tensor_pair_to_batch(start_frame, end_frame) - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( stacked_input_images, max_images=2, mime_type="image/png", @@ -464,7 +464,7 @@ class RunwayFirstLastFrameNode(RunwayVideoGenNode): if len(download_urls) != 2: raise RunwayApiError("Failed to upload one or more images to comfy api.") - return self.generate_video( + return await self.generate_video( RunwayImageToVideoRequest( promptText=prompt, seed=seed, @@ -543,11 +543,11 @@ class RunwayTextToImageNode(ComfyNodeABC): ) return True - def get_response( + async def get_response( self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None ) -> TaskStatusResponse: """Poll the task status until it is finished then get the response.""" - return poll_until_finished( + return await poll_until_finished( auth_kwargs, ApiEndpoint( path=f"{PATH_GET_TASK_STATUS}/{task_id}", @@ -559,7 +559,7 @@ class RunwayTextToImageNode(ComfyNodeABC): node_id=node_id, ) - def api_call( + async def api_call( self, prompt: str, ratio: str, @@ -574,7 +574,7 @@ class RunwayTextToImageNode(ComfyNodeABC): reference_images = None if reference_image is not None: validate_input_image(reference_image) - download_urls = upload_images_to_comfyapi( + download_urls = await upload_images_to_comfyapi( reference_image, max_images=1, mime_type="image/png", @@ -605,19 +605,19 @@ class RunwayTextToImageNode(ComfyNodeABC): auth_kwargs=kwargs, ) - initial_response = initial_operation.execute() + initial_response = await initial_operation.execute() self.validate_task_created(initial_response) task_id = initial_response.id # Poll for completion - final_response = self.get_response( + final_response = await self.get_response( task_id, auth_kwargs=kwargs, node_id=unique_id ) self.validate_response(final_response) # Download and return image image_url = get_image_url_from_task_status(final_response) - return (download_url_to_image_tensor(image_url),) + return (await download_url_to_image_tensor(image_url),) NODE_CLASS_MAPPINGS = { diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index 02e421678..31309d831 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -124,7 +124,7 @@ class StabilityStableImageUltraNode: }, } - def api_call(self, prompt: str, aspect_ratio: str, style_preset: str, seed: int, + async def api_call(self, prompt: str, aspect_ratio: str, style_preset: str, seed: int, negative_prompt: str=None, image: torch.Tensor = None, image_denoise: float=None, **kwargs): validate_string(prompt, strip_whitespace=False) @@ -163,7 +163,7 @@ class StabilityStableImageUltraNode: content_type="multipart/form-data", auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stable Image Ultra generation failed: {response_api.finish_reason}.") @@ -257,7 +257,7 @@ class StabilityStableImageSD_3_5Node: }, } - def api_call(self, model: str, prompt: str, aspect_ratio: str, style_preset: str, seed: int, cfg_scale: float, + async def api_call(self, model: str, prompt: str, aspect_ratio: str, style_preset: str, seed: int, cfg_scale: float, negative_prompt: str=None, image: torch.Tensor = None, image_denoise: float=None, **kwargs): validate_string(prompt, strip_whitespace=False) @@ -302,7 +302,7 @@ class StabilityStableImageSD_3_5Node: content_type="multipart/form-data", auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stable Diffusion 3.5 Image generation failed: {response_api.finish_reason}.") @@ -374,7 +374,7 @@ class StabilityUpscaleConservativeNode: }, } - def api_call(self, image: torch.Tensor, prompt: str, creativity: float, seed: int, negative_prompt: str=None, + async def api_call(self, image: torch.Tensor, prompt: str, creativity: float, seed: int, negative_prompt: str=None, **kwargs): validate_string(prompt, strip_whitespace=False) image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read() @@ -403,7 +403,7 @@ class StabilityUpscaleConservativeNode: content_type="multipart/form-data", auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stability Upscale Conservative generation failed: {response_api.finish_reason}.") @@ -480,7 +480,7 @@ class StabilityUpscaleCreativeNode: }, } - def api_call(self, image: torch.Tensor, prompt: str, creativity: float, style_preset: str, seed: int, negative_prompt: str=None, + async def api_call(self, image: torch.Tensor, prompt: str, creativity: float, style_preset: str, seed: int, negative_prompt: str=None, **kwargs): validate_string(prompt, strip_whitespace=False) image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read() @@ -512,7 +512,7 @@ class StabilityUpscaleCreativeNode: content_type="multipart/form-data", auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() operation = PollingOperation( poll_endpoint=ApiEndpoint( @@ -527,7 +527,7 @@ class StabilityUpscaleCreativeNode: status_extractor=lambda x: get_async_dummy_status(x), auth_kwargs=kwargs, ) - response_poll: StabilityResultsGetResponse = operation.execute() + response_poll: StabilityResultsGetResponse = await operation.execute() if response_poll.finish_reason != "SUCCESS": raise Exception(f"Stability Upscale Creative generation failed: {response_poll.finish_reason}.") @@ -563,8 +563,7 @@ class StabilityUpscaleFastNode: }, } - def api_call(self, image: torch.Tensor, - **kwargs): + async def api_call(self, image: torch.Tensor, **kwargs): image_binary = tensor_to_bytesio(image, total_pixels=4096*4096).read() files = { @@ -583,7 +582,7 @@ class StabilityUpscaleFastNode: content_type="multipart/form-data", auth_kwargs=kwargs, ) - response_api = operation.execute() + response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stability Upscale Fast failed: {response_api.finish_reason}.") diff --git a/comfy_api_nodes/nodes_tripo.py b/comfy_api_nodes/nodes_tripo.py index 65f3b21f5..d08cf9007 100644 --- a/comfy_api_nodes/nodes_tripo.py +++ b/comfy_api_nodes/nodes_tripo.py @@ -37,8 +37,8 @@ from comfy_api_nodes.apinode_utils import ( ) -def upload_image_to_tripo(image, **kwargs): - urls = upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs) +async def upload_image_to_tripo(image, **kwargs): + urls = await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs) return TripoFileReference(TripoUrlReference(url=urls[0], type="jpeg")) def get_model_url_from_response(response: TripoTaskResponse) -> str: @@ -49,7 +49,7 @@ def get_model_url_from_response(response: TripoTaskResponse) -> str: raise RuntimeError(f"Failed to get model url from response: {response}") -def poll_until_finished( +async def poll_until_finished( kwargs: dict[str, str], response: TripoTaskResponse, ) -> tuple[str, str]: @@ -57,7 +57,7 @@ def poll_until_finished( if response.code != 0: raise RuntimeError(f"Failed to generate mesh: {response.error}") task_id = response.data.task_id - response_poll = PollingOperation( + response_poll = await PollingOperation( poll_endpoint=ApiEndpoint( path=f"/proxy/tripo/v2/openapi/task/{task_id}", method=HttpMethod.GET, @@ -80,7 +80,7 @@ def poll_until_finished( ).execute() if response_poll.data.status == TripoTaskStatus.SUCCESS: url = get_model_url_from_response(response_poll) - bytesio = download_url_to_bytesio(url) + bytesio = await download_url_to_bytesio(url) # Save the downloaded model file model_file = f"tripo_model_{task_id}.glb" with open(os.path.join(get_output_directory(), model_file), "wb") as f: @@ -88,6 +88,7 @@ def poll_until_finished( return model_file, task_id raise RuntimeError(f"Failed to generate mesh: {response_poll}") + class TripoTextToModelNode: """ Generates 3D models synchronously based on a text prompt using Tripo's API. @@ -126,11 +127,11 @@ class TripoTextToModelNode: API_NODE = True OUTPUT_NODE = True - def generate_mesh(self, prompt, negative_prompt=None, model_version=None, style=None, texture=None, pbr=None, image_seed=None, model_seed=None, texture_seed=None, texture_quality=None, face_limit=None, quad=None, **kwargs): + async def generate_mesh(self, prompt, negative_prompt=None, model_version=None, style=None, texture=None, pbr=None, image_seed=None, model_seed=None, texture_seed=None, texture_quality=None, face_limit=None, quad=None, **kwargs): style_enum = None if style == "None" else style if not prompt: raise RuntimeError("Prompt is required") - response = SynchronousOperation( + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -155,7 +156,8 @@ class TripoTextToModelNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) + class TripoImageToModelNode: """ @@ -195,12 +197,12 @@ class TripoImageToModelNode: API_NODE = True OUTPUT_NODE = True - def generate_mesh(self, image, model_version=None, style=None, texture=None, pbr=None, model_seed=None, orientation=None, texture_alignment=None, texture_seed=None, texture_quality=None, face_limit=None, quad=None, **kwargs): + async def generate_mesh(self, image, model_version=None, style=None, texture=None, pbr=None, model_seed=None, orientation=None, texture_alignment=None, texture_seed=None, texture_quality=None, face_limit=None, quad=None, **kwargs): style_enum = None if style == "None" else style if image is None: raise RuntimeError("Image is required") - tripo_file = upload_image_to_tripo(image, **kwargs) - response = SynchronousOperation( + tripo_file = await upload_image_to_tripo(image, **kwargs) + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -225,7 +227,8 @@ class TripoImageToModelNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) + class TripoMultiviewToModelNode: """ @@ -267,7 +270,7 @@ class TripoMultiviewToModelNode: API_NODE = True OUTPUT_NODE = True - def generate_mesh(self, image, image_left=None, image_back=None, image_right=None, model_version=None, orientation=None, texture=None, pbr=None, model_seed=None, texture_seed=None, texture_quality=None, texture_alignment=None, face_limit=None, quad=None, **kwargs): + async def generate_mesh(self, image, image_left=None, image_back=None, image_right=None, model_version=None, orientation=None, texture=None, pbr=None, model_seed=None, texture_seed=None, texture_quality=None, texture_alignment=None, face_limit=None, quad=None, **kwargs): if image is None: raise RuntimeError("front image for multiview is required") images = [] @@ -282,11 +285,11 @@ class TripoMultiviewToModelNode: for image_name in ["image", "image_left", "image_back", "image_right"]: image_ = image_dict[image_name] if image_ is not None: - tripo_file = upload_image_to_tripo(image_, **kwargs) + tripo_file = await upload_image_to_tripo(image_, **kwargs) images.append(tripo_file) else: images.append(TripoFileEmptyReference()) - response = SynchronousOperation( + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -309,7 +312,8 @@ class TripoMultiviewToModelNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) + class TripoTextureNode: @classmethod @@ -340,8 +344,8 @@ class TripoTextureNode: OUTPUT_NODE = True AVERAGE_DURATION = 80 - def generate_mesh(self, model_task_id, texture=None, pbr=None, texture_seed=None, texture_quality=None, texture_alignment=None, **kwargs): - response = SynchronousOperation( + async def generate_mesh(self, model_task_id, texture=None, pbr=None, texture_seed=None, texture_quality=None, texture_alignment=None, **kwargs): + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -358,7 +362,7 @@ class TripoTextureNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) class TripoRefineNode: @@ -387,8 +391,8 @@ class TripoRefineNode: OUTPUT_NODE = True AVERAGE_DURATION = 240 - def generate_mesh(self, model_task_id, **kwargs): - response = SynchronousOperation( + async def generate_mesh(self, model_task_id, **kwargs): + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -400,7 +404,7 @@ class TripoRefineNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) class TripoRigNode: @@ -425,8 +429,8 @@ class TripoRigNode: OUTPUT_NODE = True AVERAGE_DURATION = 180 - def generate_mesh(self, original_model_task_id, **kwargs): - response = SynchronousOperation( + async def generate_mesh(self, original_model_task_id, **kwargs): + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -440,7 +444,8 @@ class TripoRigNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) + class TripoRetargetNode: @classmethod @@ -475,8 +480,8 @@ class TripoRetargetNode: OUTPUT_NODE = True AVERAGE_DURATION = 30 - def generate_mesh(self, animation, original_model_task_id, **kwargs): - response = SynchronousOperation( + async def generate_mesh(self, animation, original_model_task_id, **kwargs): + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -491,7 +496,8 @@ class TripoRetargetNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) + class TripoConversionNode: @classmethod @@ -529,10 +535,10 @@ class TripoConversionNode: OUTPUT_NODE = True AVERAGE_DURATION = 30 - def generate_mesh(self, original_model_task_id, format, quad, face_limit, texture_size, texture_format, **kwargs): + async def generate_mesh(self, original_model_task_id, format, quad, face_limit, texture_size, texture_format, **kwargs): if not original_model_task_id: raise RuntimeError("original_model_task_id is required") - response = SynchronousOperation( + response = await SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/tripo/v2/openapi/task", method=HttpMethod.POST, @@ -549,7 +555,8 @@ class TripoConversionNode: ), auth_kwargs=kwargs, ).execute() - return poll_until_finished(kwargs, response) + return await poll_until_finished(kwargs, response) + NODE_CLASS_MAPPINGS = { "TripoTextToModelNode": TripoTextToModelNode, diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index 97bfe20e6..e25dab2f5 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -1,7 +1,7 @@ import io import logging import base64 -import requests +import aiohttp import torch from typing import Optional @@ -152,7 +152,7 @@ class VeoVideoGenerationNode(ComfyNodeABC): DESCRIPTION = "Generates videos from text prompts using Google's Veo 2 API" API_NODE = True - def generate_video( + async def generate_video( self, prompt, aspect_ratio="16:9", @@ -217,7 +217,7 @@ class VeoVideoGenerationNode(ComfyNodeABC): auth_kwargs=kwargs, ) - initial_response = initial_operation.execute() + initial_response = await initial_operation.execute() operation_name = initial_response.name logging.info(f"Veo generation started with operation name: {operation_name}") @@ -256,7 +256,7 @@ class VeoVideoGenerationNode(ComfyNodeABC): ) # Execute the polling operation - poll_response = poll_operation.execute() + poll_response = await poll_operation.execute() # Now check for errors in the final response # Check for error in poll response @@ -281,7 +281,6 @@ class VeoVideoGenerationNode(ComfyNodeABC): raise Exception(error_message) # Extract video data - video_data = None if poll_response.response and hasattr(poll_response.response, 'videos') and poll_response.response.videos and len(poll_response.response.videos) > 0: video = poll_response.response.videos[0] @@ -291,9 +290,9 @@ class VeoVideoGenerationNode(ComfyNodeABC): video_data = base64.b64decode(video.bytesBase64Encoded) elif hasattr(video, 'gcsUri') and video.gcsUri: # Download from URL - video_url = video.gcsUri - video_response = requests.get(video_url) - video_data = video_response.content + async with aiohttp.ClientSession() as session: + async with session.get(video.gcsUri) as video_response: + video_data = await video_response.content.read() else: raise Exception("Video returned but no data or URL was provided") else: From 735bb4bdb186bd4f39b9c924c24b8b39a7ef8b0d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 8 Aug 2025 01:21:00 -0700 Subject: [PATCH 111/759] Users report gfx1201 is buggy on flux with pytorch attention. (#9244) --- comfy/model_management.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 9e6149d60..dc5b4711d 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -321,9 +321,9 @@ try: if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950 ENABLE_PYTORCH_ATTENTION = True - if torch_version_numeric >= (2, 8): - if any((a in arch) for a in ["gfx1201"]): - ENABLE_PYTORCH_ATTENTION = True +# if torch_version_numeric >= (2, 8): +# if any((a in arch) for a in ["gfx1201"]): +# ENABLE_PYTORCH_ATTENTION = True if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4): if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]): # TODO: more arches SUPPORT_FP8_OPS = True From 5828607ccfef82a82931d8b66f3fd1176e04588f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 9 Aug 2025 09:49:25 -0700 Subject: [PATCH 112/759] Not sure if AMD actually support fp16 acc but it doesn't crash. (#9258) --- comfy/model_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index dc5b4711d..c08f759e5 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -340,7 +340,7 @@ if ENABLE_PYTORCH_ATTENTION: PRIORITIZE_FP16 = False # TODO: remove and replace with something that shows exactly which dtype is faster than the other try: - if is_nvidia() and PerformanceFeature.Fp16Accumulation in args.fast: + if (is_nvidia() or is_amd()) and PerformanceFeature.Fp16Accumulation in args.fast: torch.backends.cuda.matmul.allow_fp16_accumulation = True PRIORITIZE_FP16 = True # TODO: limit to cards where it actually boosts performance logging.info("Enabled fp16 accumulation.") From 0552de7c7d6bcdd515da115d6756fd30494c7ff4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 10 Aug 2025 02:03:47 -0700 Subject: [PATCH 113/759] Bump pytorch cuda and rocm versions in readme instructions. (#9273) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 119098f5c..e4cff01a9 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,7 @@ Put your VAE in: models/vae ### AMD GPUs (Linux only) AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version: -```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.3``` +```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4``` This is the command to install the nightly with ROCm 6.4 which might have some performance improvements: @@ -237,7 +237,7 @@ Additional discussion and help can be found [here](https://github.com/comfyanony Nvidia users should install stable pytorch using this command: -```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu128``` +```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129``` This is the command to install pytorch nightly instead which might have performance improvements. From 966f3a52061b5e300f36c6de0d07c47d6ad12f76 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 11 Aug 2025 02:53:01 -0700 Subject: [PATCH 114/759] Only show feature flags log when verbose. (#9281) --- server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server.py b/server.py index 0553a0dd7..8f9c88ebf 100644 --- a/server.py +++ b/server.py @@ -235,7 +235,7 @@ class PromptServer(): sid, ) - logging.info( + logging.debug( f"Feature flags negotiated for client {sid}: {client_flags}" ) first_message = False From fa340add552497a264071fd7f6c407ff4aa10449 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 11 Aug 2025 23:48:17 +0300 Subject: [PATCH 115/759] remove creation of non-used asyncio_loop (#9284) --- execution.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/execution.py b/execution.py index 952f0cc5c..1dc35738b 100644 --- a/execution.py +++ b/execution.py @@ -646,8 +646,6 @@ class PromptExecutor: self.add_message("execution_error", mes, broadcast=False) def execute(self, prompt, prompt_id, extra_data={}, execute_outputs=[]): - asyncio_loop = asyncio.new_event_loop() - asyncio.set_event_loop(asyncio_loop) asyncio.run(self.execute_async(prompt, prompt_id, extra_data, execute_outputs)) async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs=[]): From 629b17383718e1f46dbba101ea83ec897fbe3082 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 12 Aug 2025 04:52:12 +0800 Subject: [PATCH 116/759] Update template & embedded docs (#9283) * Update template & embedded docs * Update embedded docs to 0.2.6 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2f4692b03..2fb38ef27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ comfyui-frontend-package==1.24.4 -comfyui-workflow-templates==0.1.52 -comfyui-embedded-docs==0.2.4 +comfyui-workflow-templates==0.1.53 +comfyui-embedded-docs==0.2.6 torch torchsde torchvision From 2208aa616d3ad193cd37ef57076d4f5243cecdd3 Mon Sep 17 00:00:00 2001 From: PsychoLogicAu Date: Tue, 12 Aug 2025 06:56:16 +1000 Subject: [PATCH 117/759] Support SimpleTuner lycoris lora for Qwen-Image (#9280) --- comfy/lora.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy/lora.py b/comfy/lora.py index 6686b7229..00358884b 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -301,6 +301,7 @@ def model_lora_keys_unet(model, key_map={}): key_map["{}".format(key_lora)] = k # Support transformer prefix format key_map["transformer.{}".format(key_lora)] = k + key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k #SimpleTuner lycoris format return key_map From f4231a80b1b904b45ade0def9b37320c4adfe71b Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 12 Aug 2025 00:15:14 +0300 Subject: [PATCH 118/759] fix(Kling Image API Node): do not pass "image_type" when no image (#9271) * fix(Kling Image API Node): do not pass "image_type" when no image * fix(Kling Image API Node): raise client-side error when kling_v1 is used with reference image --- comfy_api_nodes/nodes_kling.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 9d9eb5628..9d483bb0e 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -1690,7 +1690,11 @@ class KlingImageGenerationNode(KlingImageGenerationBase): ): self.validate_prompt(prompt, negative_prompt) - if image is not None: + if image is None: + image_type = None + elif model_name == KlingImageGenModelName.kling_v1: + raise ValueError(f"The model {KlingImageGenModelName.kling_v1.value} does not support reference images.") + else: image = tensor_to_base64_string(image) initial_operation = SynchronousOperation( From 1e3ae1eed8b925430e3b114ea6b7d08ea698e305 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 13 Aug 2025 05:14:27 +0800 Subject: [PATCH 119/759] Update template to 0.1.58 (#9302) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2fb38ef27..82af5690b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.24.4 -comfyui-workflow-templates==0.1.53 +comfyui-workflow-templates==0.1.58 comfyui-embedded-docs==0.2.6 torch torchsde From e1d4f36d8df7446ebb1a5f2bf9c708c38a159f22 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 12 Aug 2025 17:13:04 -0700 Subject: [PATCH 120/759] Update test release package workflow with python 3.13 cu129. (#9306) --- .github/workflows/windows_release_package.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/windows_release_package.yml b/.github/workflows/windows_release_package.yml index 3926a65f3..b51746285 100644 --- a/.github/workflows/windows_release_package.yml +++ b/.github/workflows/windows_release_package.yml @@ -7,19 +7,19 @@ on: description: 'cuda version' required: true type: string - default: "128" + default: "129" python_minor: description: 'python minor version' required: true type: string - default: "12" + default: "13" python_patch: description: 'python patch version' required: true type: string - default: "10" + default: "6" # push: # branches: # - master @@ -64,6 +64,8 @@ jobs: ./python.exe get-pip.py ./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/* sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth + + rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space cd .. git clone --depth 1 https://github.com/comfyanonymous/taesd From 560d38f34c5bd532f89f2178f01ee819cf145820 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 12 Aug 2025 20:26:33 -0700 Subject: [PATCH 121/759] Wan2.2 fun control support. (#9292) --- comfy/ldm/wan/model.py | 19 +++++++++++++ comfy/model_base.py | 10 ++++++- comfy/model_detection.py | 5 ++++ comfy_extras/nodes_wan.py | 58 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 1 deletion(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 86d0795e9..4e2d99566 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -391,6 +391,7 @@ class WanModel(torch.nn.Module): cross_attn_norm=True, eps=1e-6, flf_pos_embed_token_number=None, + in_dim_ref_conv=None, image_model=None, device=None, dtype=None, @@ -484,6 +485,11 @@ class WanModel(torch.nn.Module): else: self.img_emb = None + if in_dim_ref_conv is not None: + self.ref_conv = operations.Conv2d(in_dim_ref_conv, dim, kernel_size=patch_size[1:], stride=patch_size[1:], device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + else: + self.ref_conv = None + def forward_orig( self, x, @@ -526,6 +532,13 @@ class WanModel(torch.nn.Module): e = e.reshape(t.shape[0], -1, e.shape[-1]) e0 = self.time_projection(e).unflatten(2, (6, self.dim)) + full_ref = None + if self.ref_conv is not None: + full_ref = kwargs.get("reference_latent", None) + if full_ref is not None: + full_ref = self.ref_conv(full_ref).flatten(2).transpose(1, 2) + x = torch.concat((full_ref, x), dim=1) + # context context = self.text_embedding(context) @@ -552,6 +565,9 @@ class WanModel(torch.nn.Module): # head x = self.head(x, e) + if full_ref is not None: + x = x[:, full_ref.shape[1]:] + # unpatchify x = self.unpatchify(x, grid_sizes) return x @@ -570,6 +586,9 @@ class WanModel(torch.nn.Module): x = torch.cat([x, time_dim_concat], dim=2) t_len = ((x.shape[2] + (patch_size[0] // 2)) // patch_size[0]) + if self.ref_conv is not None and "reference_latent" in kwargs: + t_len += 1 + img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype) img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1) img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1) diff --git a/comfy/model_base.py b/comfy/model_base.py index 8a2d9cbe6..cde61df7c 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1124,7 +1124,11 @@ class WAN21(BaseModel): mask = mask.repeat(1, 4, 1, 1, 1) mask = utils.resize_to_batch_size(mask, noise.shape[0]) - return torch.cat((mask, image), dim=1) + concat_mask_index = kwargs.get("concat_mask_index", 0) + if concat_mask_index != 0: + return torch.cat((image[:, :concat_mask_index], mask, image[:, concat_mask_index:]), dim=1) + else: + return torch.cat((mask, image), dim=1) def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) @@ -1140,6 +1144,10 @@ class WAN21(BaseModel): if time_dim_concat is not None: out['time_dim_concat'] = comfy.conds.CONDRegular(self.process_latent_in(time_dim_concat)) + reference_latents = kwargs.get("reference_latents", None) + if reference_latents is not None: + out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])[:, :, 0]) + return out diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 8b57ebd2f..8acc51e20 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -373,6 +373,11 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): flf_weight = state_dict.get('{}img_emb.emb_pos'.format(key_prefix)) if flf_weight is not None: dit_config["flf_pos_embed_token_number"] = flf_weight.shape[1] + + ref_conv_weight = state_dict.get('{}ref_conv.weight'.format(key_prefix)) + if ref_conv_weight is not None: + dit_config["in_dim_ref_conv"] = ref_conv_weight.shape[1] + return dit_config if '{}latent_in.weight'.format(key_prefix) in state_dict_keys: # Hunyuan 3D diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 0067d054d..f80c83ba6 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -103,6 +103,63 @@ class WanFunControlToVideo: out_latent["samples"] = latent return (positive, negative, out_latent) +class Wan22FunControlToVideo: + @classmethod + def INPUT_TYPES(s): + return {"required": {"positive": ("CONDITIONING", ), + "negative": ("CONDITIONING", ), + "vae": ("VAE", ), + "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), + "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), + "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), + "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), + }, + "optional": {"ref_image": ("IMAGE", ), + "control_video": ("IMAGE", ), + # "start_image": ("IMAGE", ), + }} + + RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") + RETURN_NAMES = ("positive", "negative", "latent") + FUNCTION = "encode" + + CATEGORY = "conditioning/video_models" + + def encode(self, positive, negative, vae, width, height, length, batch_size, ref_image=None, start_image=None, control_video=None): + latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) + concat_latent = concat_latent.repeat(1, 2, 1, 1, 1) + mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1])) + + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + concat_latent_image = vae.encode(start_image[:, :, :, :3]) + concat_latent[:,16:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] + mask[:, :, :start_image.shape[0] + 3] = 0.0 + + ref_latent = None + if ref_image is not None: + ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + ref_latent = vae.encode(ref_image[:, :, :, :3]) + + if control_video is not None: + control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + concat_latent_image = vae.encode(control_video[:, :, :, :3]) + concat_latent[:,:16,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] + + mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2) + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16}) + + if ref_latent is not None: + positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) + negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True) + + out_latent = {} + out_latent["samples"] = latent + return (positive, negative, out_latent) + class WanFirstLastFrameToVideo: @classmethod def INPUT_TYPES(s): @@ -733,6 +790,7 @@ NODE_CLASS_MAPPINGS = { "WanTrackToVideo": WanTrackToVideo, "WanImageToVideo": WanImageToVideo, "WanFunControlToVideo": WanFunControlToVideo, + "Wan22FunControlToVideo": Wan22FunControlToVideo, "WanFunInpaintToVideo": WanFunInpaintToVideo, "WanFirstLastFrameToVideo": WanFirstLastFrameToVideo, "WanVaceToVideo": WanVaceToVideo, From 898d88e10e45f38500ca6044280bab4ca2f2d273 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Tue, 12 Aug 2025 20:34:58 -0700 Subject: [PATCH 122/759] Make torchaudio exception catching less specific (#9309) --- comfy_api/latest/_ui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py index 61597038f..26a55615f 100644 --- a/comfy_api/latest/_ui.py +++ b/comfy_api/latest/_ui.py @@ -12,7 +12,7 @@ import torch try: import torchaudio TORCH_AUDIO_AVAILABLE = True -except ImportError: +except: TORCH_AUDIO_AVAILABLE = False from PIL import Image as PILImage from PIL.PngImagePlugin import PngInfo From 3294782d19c3af0c6166aafe0465fe6b59571d17 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 13 Aug 2025 14:50:50 +0800 Subject: [PATCH 123/759] Update template to 0.1.59 (#9313) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 82af5690b..bfb31a73f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.24.4 -comfyui-workflow-templates==0.1.58 +comfyui-workflow-templates==0.1.59 comfyui-embedded-docs==0.2.6 torch torchsde From 5ca8e2fac3b6826261c5991b0663b69eff60b3a1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 13 Aug 2025 00:01:12 -0700 Subject: [PATCH 124/759] Update release workflow to python3.13 pytorch cu129 (#9315) * Try to reduce size of portable even more. * Update stable release workflow to python 3.13 cu129 * Update dependencies workflow to python3.13 cu129 --- .github/workflows/stable-release.yml | 15 ++++++++++----- .../workflows/windows_release_dependencies.yml | 6 +++--- .github/workflows/windows_release_package.yml | 2 ++ 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index 61105abe4..a5a1ed2d0 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -12,17 +12,17 @@ on: description: 'CUDA version' required: true type: string - default: "128" + default: "129" python_minor: description: 'Python minor version' required: true type: string - default: "12" + default: "13" python_patch: description: 'Python patch version' required: true type: string - default: "10" + default: "6" jobs: @@ -66,8 +66,13 @@ jobs: curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py ./python.exe get-pip.py ./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/* - sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth - cd .. + sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth + + rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space + rm ./Lib/site-packages/torch/lib/libprotoc.lib + rm ./Lib/site-packages/torch/lib/libprotobuf.lib + + cd .. git clone --depth 1 https://github.com/comfyanonymous/taesd cp taesd/*.safetensors ./ComfyUI_copy/models/vae_approx/ diff --git a/.github/workflows/windows_release_dependencies.yml b/.github/workflows/windows_release_dependencies.yml index dfdb96d50..7761cc1ed 100644 --- a/.github/workflows/windows_release_dependencies.yml +++ b/.github/workflows/windows_release_dependencies.yml @@ -17,19 +17,19 @@ on: description: 'cuda version' required: true type: string - default: "128" + default: "129" python_minor: description: 'python minor version' required: true type: string - default: "12" + default: "13" python_patch: description: 'python patch version' required: true type: string - default: "10" + default: "6" # push: # branches: # - master diff --git a/.github/workflows/windows_release_package.yml b/.github/workflows/windows_release_package.yml index b51746285..3334e6839 100644 --- a/.github/workflows/windows_release_package.yml +++ b/.github/workflows/windows_release_package.yml @@ -66,6 +66,8 @@ jobs: sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space + rm ./Lib/site-packages/torch/lib/libprotoc.lib + rm ./Lib/site-packages/torch/lib/libprotobuf.lib cd .. git clone --depth 1 https://github.com/comfyanonymous/taesd From e400f26c8fc9867248394616a4b58ecc4c53fbfd Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 13 Aug 2025 00:44:54 -0700 Subject: [PATCH 125/759] Downgrade frontend for release. (#9316) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bfb31a73f..56ed85e01 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.24.4 +comfyui-frontend-package==1.23.4 comfyui-workflow-templates==0.1.59 comfyui-embedded-docs==0.2.6 torch From d5c1954d5cd4a789bbf84d2b75a955a5a3f93de8 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 13 Aug 2025 03:46:38 -0400 Subject: [PATCH 126/759] ComfyUI version 0.3.50 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 5e2d09c81..29ec07ca6 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.49" +__version__ = "0.3.50" diff --git a/pyproject.toml b/pyproject.toml index 3c530ba85..659b5730a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.49" +version = "0.3.50" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 615eb52049df98cebdd67bc672b66dc059171d7c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 13 Aug 2025 00:48:06 -0700 Subject: [PATCH 127/759] Put back frontend version. (#9317) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 56ed85e01..bfb31a73f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.23.4 +comfyui-frontend-package==1.24.4 comfyui-workflow-templates==0.1.59 comfyui-embedded-docs==0.2.6 torch From afa0a45206832b0e64e38454b7841d1da7ca56e4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 13 Aug 2025 11:42:08 -0700 Subject: [PATCH 128/759] Reduce portable size again. (#9323) * compress more * test * not needed --- .github/workflows/stable-release.yml | 2 +- .github/workflows/windows_release_package.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index a5a1ed2d0..2bc8e5905 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -90,7 +90,7 @@ jobs: cd .. - "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable + "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_nvidia.7z cd ComfyUI_windows_portable diff --git a/.github/workflows/windows_release_package.yml b/.github/workflows/windows_release_package.yml index 3334e6839..46375698e 100644 --- a/.github/workflows/windows_release_package.yml +++ b/.github/workflows/windows_release_package.yml @@ -86,7 +86,7 @@ jobs: cd .. - "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable + "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable mv ComfyUI_windows_portable.7z ComfyUI/new_ComfyUI_windows_portable_nvidia_cu${{ inputs.cu }}_or_cpu.7z cd ComfyUI_windows_portable From 3da5a07510794c37d437cbea1d94065bb0aa8ebc Mon Sep 17 00:00:00 2001 From: contentis Date: Wed, 13 Aug 2025 20:53:27 +0200 Subject: [PATCH 129/759] SDPA backend priority (#9299) --- comfy/ldm/hunyuan3d/vae.py | 2 +- comfy/ldm/modules/attention.py | 4 ++-- comfy/ldm/modules/diffusionmodules/model.py | 2 +- comfy/ops.py | 13 +++++++++++++ 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/comfy/ldm/hunyuan3d/vae.py b/comfy/ldm/hunyuan3d/vae.py index 5eb2c6548..bea6090a2 100644 --- a/comfy/ldm/hunyuan3d/vae.py +++ b/comfy/ldm/hunyuan3d/vae.py @@ -178,7 +178,7 @@ class FourierEmbedder(nn.Module): class CrossAttentionProcessor: def __call__(self, attn, q, k, v): - out = F.scaled_dot_product_attention(q, k, v) + out = ops.scaled_dot_product_attention(q, k, v) return out diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index 35d2270ee..19c3c7af1 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -448,7 +448,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha mask = mask.unsqueeze(1) if SDP_BATCH_LIMIT >= b: - out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False) + out = ops.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False) if not skip_output_reshape: out = ( out.transpose(1, 2).reshape(b, -1, heads * dim_head) @@ -461,7 +461,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha if mask.shape[0] > 1: m = mask[i : i + SDP_BATCH_LIMIT] - out[i : i + SDP_BATCH_LIMIT] = torch.nn.functional.scaled_dot_product_attention( + out[i : i + SDP_BATCH_LIMIT] = ops.scaled_dot_product_attention( q[i : i + SDP_BATCH_LIMIT], k[i : i + SDP_BATCH_LIMIT], v[i : i + SDP_BATCH_LIMIT], diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index 5c0373b74..79160412f 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -285,7 +285,7 @@ def pytorch_attention(q, k, v): ) try: - out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False) + out = ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False) out = out.transpose(2, 3).reshape(orig_shape) except model_management.OOM_EXCEPTION: logging.warning("scaled_dot_product_attention OOMed: switched to slice attention") diff --git a/comfy/ops.py b/comfy/ops.py index 2cc9bbc27..8b7b662b6 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -23,9 +23,18 @@ from comfy.cli_args import args, PerformanceFeature import comfy.float import comfy.rmsnorm import contextlib +from torch.nn.attention import SDPBackend, sdpa_kernel cast_to = comfy.model_management.cast_to #TODO: remove once no more references +SDPA_BACKEND_PRIORITY = [ + SDPBackend.FLASH_ATTENTION, + SDPBackend.EFFICIENT_ATTENTION, + SDPBackend.MATH, +] +if torch.cuda.is_available(): + SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION) + def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) @@ -249,6 +258,10 @@ class disable_weight_init: else: raise ValueError(f"unsupported dimensions: {dims}") + @staticmethod + @sdpa_kernel(backends=SDPA_BACKEND_PRIORITY, set_priority=True) + def scaled_dot_product_attention(q, k, v, *args, **kwargs): + return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) class manual_cast(disable_weight_init): class Linear(disable_weight_init.Linear): From 9df8792d4b894a8ea8034414ef63f70deee4b1af Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 13 Aug 2025 12:12:41 -0700 Subject: [PATCH 130/759] Make last PR not crash comfy on old pytorch. (#9324) --- comfy/ldm/hunyuan3d/vae.py | 2 +- comfy/ldm/modules/attention.py | 4 +-- comfy/ldm/modules/diffusionmodules/model.py | 2 +- comfy/ops.py | 36 +++++++++++++-------- 4 files changed, 27 insertions(+), 17 deletions(-) diff --git a/comfy/ldm/hunyuan3d/vae.py b/comfy/ldm/hunyuan3d/vae.py index bea6090a2..6e8cbf1d9 100644 --- a/comfy/ldm/hunyuan3d/vae.py +++ b/comfy/ldm/hunyuan3d/vae.py @@ -178,7 +178,7 @@ class FourierEmbedder(nn.Module): class CrossAttentionProcessor: def __call__(self, attn, q, k, v): - out = ops.scaled_dot_product_attention(q, k, v) + out = comfy.ops.scaled_dot_product_attention(q, k, v) return out diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index 19c3c7af1..043df28df 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -448,7 +448,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha mask = mask.unsqueeze(1) if SDP_BATCH_LIMIT >= b: - out = ops.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False) + out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False) if not skip_output_reshape: out = ( out.transpose(1, 2).reshape(b, -1, heads * dim_head) @@ -461,7 +461,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha if mask.shape[0] > 1: m = mask[i : i + SDP_BATCH_LIMIT] - out[i : i + SDP_BATCH_LIMIT] = ops.scaled_dot_product_attention( + out[i : i + SDP_BATCH_LIMIT] = comfy.ops.scaled_dot_product_attention( q[i : i + SDP_BATCH_LIMIT], k[i : i + SDP_BATCH_LIMIT], v[i : i + SDP_BATCH_LIMIT], diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index 79160412f..1fd12b35a 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -285,7 +285,7 @@ def pytorch_attention(q, k, v): ) try: - out = ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False) + out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False) out = out.transpose(2, 3).reshape(orig_shape) except model_management.OOM_EXCEPTION: logging.warning("scaled_dot_product_attention OOMed: switched to slice attention") diff --git a/comfy/ops.py b/comfy/ops.py index 8b7b662b6..be312d714 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -23,18 +23,32 @@ from comfy.cli_args import args, PerformanceFeature import comfy.float import comfy.rmsnorm import contextlib -from torch.nn.attention import SDPBackend, sdpa_kernel + + +def scaled_dot_product_attention(q, k, v, *args, **kwargs): + return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) + + +try: + if torch.cuda.is_available(): + from torch.nn.attention import SDPBackend, sdpa_kernel + + SDPA_BACKEND_PRIORITY = [ + SDPBackend.FLASH_ATTENTION, + SDPBackend.EFFICIENT_ATTENTION, + SDPBackend.MATH, + ] + + SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION) + + @sdpa_kernel(backends=SDPA_BACKEND_PRIORITY, set_priority=True) + def scaled_dot_product_attention(q, k, v, *args, **kwargs): + return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) +except (ModuleNotFoundError, TypeError): + logging.warning("Could not set sdpa backend priority.") cast_to = comfy.model_management.cast_to #TODO: remove once no more references -SDPA_BACKEND_PRIORITY = [ - SDPBackend.FLASH_ATTENTION, - SDPBackend.EFFICIENT_ATTENTION, - SDPBackend.MATH, -] -if torch.cuda.is_available(): - SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION) - def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) @@ -258,10 +272,6 @@ class disable_weight_init: else: raise ValueError(f"unsupported dimensions: {dims}") - @staticmethod - @sdpa_kernel(backends=SDPA_BACKEND_PRIORITY, set_priority=True) - def scaled_dot_product_attention(q, k, v, *args, **kwargs): - return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) class manual_cast(disable_weight_init): class Linear(disable_weight_init.Linear): From c991a5da658667cf29f2916bef096fa7b18afd47 Mon Sep 17 00:00:00 2001 From: Simon Lui <502929+simonlui@users.noreply.github.com> Date: Wed, 13 Aug 2025 16:13:35 -0700 Subject: [PATCH 131/759] Fix XPU iGPU regressions (#9322) * Change bf16 check and switch non-blocking to off default with option to force to regain speed on certain classes of iGPUs and refactor xpu check. * Turn non_blocking off by default for xpu. * Update README.md for Intel GPUs. --- README.md | 28 ++++++++++------------------ comfy/cli_args.py | 2 ++ comfy/model_management.py | 21 +++++++++++++-------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index e4cff01a9..fa99a8cbe 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ ComfyUI lets you design and execute advanced stable diffusion pipelines using a ## Get Started #### [Desktop Application](https://www.comfy.org/download) -- The easiest way to get started. +- The easiest way to get started. - Available on Windows & macOS. #### [Windows Portable Package](#installing) @@ -211,27 +211,19 @@ This is the command to install the nightly with ROCm 6.4 which might have some p ### Intel GPUs (Windows and Linux) -(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip (currently available in PyTorch nightly builds). More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html) - -1. To install PyTorch nightly, use the following command: +(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html) + +1. To install PyTorch xpu, use the following command: + +```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu``` + +This is the command to install the Pytorch xpu nightly which might have some performance improvements: ```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu``` -2. Launch ComfyUI by running `python main.py` - - (Option 2) Alternatively, Intel GPUs supported by Intel Extension for PyTorch (IPEX) can leverage IPEX for improved performance. -1. For Intel® Arc™ A-Series Graphics utilizing IPEX, create a conda environment and use the commands below: - -``` -conda install libuv -pip install torch==2.3.1.post0+cxx11.abi torchvision==0.18.1.post0+cxx11.abi torchaudio==2.3.1.post0+cxx11.abi intel-extension-for-pytorch==2.3.110.post0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ -``` - -For other supported Intel GPUs with IPEX, visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information. - -Additional discussion and help can be found [here](https://github.com/comfyanonymous/ComfyUI/discussions/476). +1. visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information. ### NVIDIA @@ -352,7 +344,7 @@ Generate a self-signed certificate (not appropriate for shared/production use) a Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app will now be accessible with `https://...` instead of `http://...`. -> Note: Windows users can use [alexisrolland/docker-openssl](https://github.com/alexisrolland/docker-openssl) or one of the [3rd party binary distributions](https://wiki.openssl.org/index.php/Binaries) to run the command example above. +> Note: Windows users can use [alexisrolland/docker-openssl](https://github.com/alexisrolland/docker-openssl) or one of the [3rd party binary distributions](https://wiki.openssl.org/index.php/Binaries) to run the command example above.

If you use a container, note that the volume mount `-v` can be a relative path so `... -v ".\:/openssl-certs" ...` would create the key & cert files in the current directory of your command prompt or powershell terminal. ## Support and dev channel diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 0d760d524..de3e85c08 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -132,6 +132,8 @@ parser.add_argument("--reserve-vram", type=float, default=None, help="Set the am parser.add_argument("--async-offload", action="store_true", help="Use async weight offloading.") +parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.") + parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.") parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.") diff --git a/comfy/model_management.py b/comfy/model_management.py index c08f759e5..2a9f18068 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -78,7 +78,6 @@ try: torch_version = torch.version.__version__ temp = torch_version.split(".") torch_version_numeric = (int(temp[0]), int(temp[1])) - xpu_available = (torch_version_numeric[0] < 2 or (torch_version_numeric[0] == 2 and torch_version_numeric[1] <= 4)) and torch.xpu.is_available() except: pass @@ -102,10 +101,14 @@ if args.directml is not None: try: import intel_extension_for_pytorch as ipex # noqa: F401 - _ = torch.xpu.device_count() - xpu_available = xpu_available or torch.xpu.is_available() except: - xpu_available = xpu_available or (hasattr(torch, "xpu") and torch.xpu.is_available()) + pass + +try: + _ = torch.xpu.device_count() + xpu_available = torch.xpu.is_available() +except: + xpu_available = False try: if torch.backends.mps.is_available(): @@ -946,10 +949,12 @@ def pick_weight_dtype(dtype, fallback_dtype, device=None): return dtype def device_supports_non_blocking(device): + if args.force_non_blocking: + return True if is_device_mps(device): return False #pytorch bug? mps doesn't support non blocking - if is_intel_xpu(): - return True + if is_intel_xpu(): #xpu does support non blocking but it is slower on iGPUs for some reason so disable by default until situation changes + return False if args.deterministic: #TODO: figure out why deterministic breaks non blocking from gpu to cpu (previews) return False if directml_enabled: @@ -1282,10 +1287,10 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma return False if is_intel_xpu(): - if torch_version_numeric < (2, 6): + if torch_version_numeric < (2, 3): return True else: - return torch.xpu.get_device_capability(device)['has_bfloat16_conversions'] + return torch.xpu.is_bf16_supported() if is_ascend_npu(): return True From e4f7ea105f4b3034593f316560d952b80453e344 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Wed, 13 Aug 2025 18:33:05 -0700 Subject: [PATCH 132/759] Added context window support to core sampling code (#9238) * Added initial support for basic context windows - in progress * Add prepare_sampling wrapper for context window to more accurately estimate latent memory requirements, fixed merging wrappers/callbacks dicts in prepare_model_patcher * Made context windows compatible with different dimensions; works for WAN, but results are bad * Fix comfy.patcher_extension.merge_nested_dicts calls in prepare_model_patcher in sampler_helpers.py * Considering adding some callbacks to context window code to allow extensions of behavior without the need to rewrite code * Made dim slicing cleaner * Add Wan Context WIndows node for testing * Made context schedule and fuse method functions be stored on the handler instead of needing to be registered in core code to be found * Moved some code around between node_context_windows.py and context_windows.py * Change manual context window nodes names/ids * Added callbacks to IndexListContexHandler * Adjusted default values for context_length and context_overlap, made schema.inputs definition for WAN Context Windows less annoying * Make get_resized_cond more robust for various dim sizes * Fix typo * Another small fix --- comfy/context_windows.py | 537 ++++++++++++++++++++++++++ comfy/sampler_helpers.py | 6 +- comfy/samplers.py | 11 +- comfy_extras/nodes_context_windows.py | 89 +++++ nodes.py | 1 + 5 files changed, 639 insertions(+), 5 deletions(-) create mode 100644 comfy/context_windows.py create mode 100644 comfy_extras/nodes_context_windows.py diff --git a/comfy/context_windows.py b/comfy/context_windows.py new file mode 100644 index 000000000..928b111df --- /dev/null +++ b/comfy/context_windows.py @@ -0,0 +1,537 @@ +from __future__ import annotations +from typing import TYPE_CHECKING, Callable +import torch +import numpy as np +import collections +from dataclasses import dataclass +from abc import ABC, abstractmethod +import logging +import comfy.model_management +import comfy.patcher_extension +if TYPE_CHECKING: + from comfy.model_base import BaseModel + from comfy.model_patcher import ModelPatcher + from comfy.controlnet import ControlBase + + +class ContextWindowABC(ABC): + def __init__(self): + ... + + @abstractmethod + def get_tensor(self, full: torch.Tensor) -> torch.Tensor: + """ + Get torch.Tensor applicable to current window. + """ + raise NotImplementedError("Not implemented.") + + @abstractmethod + def add_window(self, full: torch.Tensor, to_add: torch.Tensor) -> torch.Tensor: + """ + Apply torch.Tensor of window to the full tensor, in place. Returns reference to updated full tensor, not a copy. + """ + raise NotImplementedError("Not implemented.") + +class ContextHandlerABC(ABC): + def __init__(self): + ... + + @abstractmethod + def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool: + raise NotImplementedError("Not implemented.") + + @abstractmethod + def get_resized_cond(self, cond_in: list[dict], x_in: torch.Tensor, window: ContextWindowABC, device=None) -> list: + raise NotImplementedError("Not implemented.") + + @abstractmethod + def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]): + raise NotImplementedError("Not implemented.") + + + +class IndexListContextWindow(ContextWindowABC): + def __init__(self, index_list: list[int], dim: int=0): + self.index_list = index_list + self.context_length = len(index_list) + self.dim = dim + + def get_tensor(self, full: torch.Tensor, device=None, dim=None) -> torch.Tensor: + if dim is None: + dim = self.dim + if dim == 0 and full.shape[dim] == 1: + return full + idx = [slice(None)] * dim + [self.index_list] + return full[idx].to(device) + + def add_window(self, full: torch.Tensor, to_add: torch.Tensor, dim=None) -> torch.Tensor: + if dim is None: + dim = self.dim + idx = [slice(None)] * dim + [self.index_list] + full[idx] += to_add + return full + + +class IndexListCallbacks: + EVALUATE_CONTEXT_WINDOWS = "evaluate_context_windows" + COMBINE_CONTEXT_WINDOW_RESULTS = "combine_context_window_results" + EXECUTE_START = "execute_start" + EXECUTE_CLEANUP = "execute_cleanup" + + def init_callbacks(self): + return {} + + +@dataclass +class ContextSchedule: + name: str + func: Callable + +@dataclass +class ContextFuseMethod: + name: str + func: Callable + +ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_conds_out', 'sub_conds', 'window']) +class IndexListContextHandler(ContextHandlerABC): + def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1, closed_loop=False, dim=0): + self.context_schedule = context_schedule + self.fuse_method = fuse_method + self.context_length = context_length + self.context_overlap = context_overlap + self.context_stride = context_stride + self.closed_loop = closed_loop + self.dim = dim + self._step = 0 + + self.callbacks = {} + + def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool: + # for now, assume first dim is batch - should have stored on BaseModel in actual implementation + if x_in.size(self.dim) > self.context_length: + logging.info(f"Using context windows {self.context_length} for {x_in.size(self.dim)} frames.") + return True + return False + + def prepare_control_objects(self, control: ControlBase, device=None) -> ControlBase: + if control.previous_controlnet is not None: + self.prepare_control_objects(control.previous_controlnet, device) + return control + + def get_resized_cond(self, cond_in: list[dict], x_in: torch.Tensor, window: IndexListContextWindow, device=None) -> list: + if cond_in is None: + return None + # reuse or resize cond items to match context requirements + resized_cond = [] + # cond object is a list containing a dict - outer list is irrelevant, so just loop through it + for actual_cond in cond_in: + resized_actual_cond = actual_cond.copy() + # now we are in the inner dict - "pooled_output" is a tensor, "control" is a ControlBase object, "model_conds" is dictionary + for key in actual_cond: + try: + cond_item = actual_cond[key] + if isinstance(cond_item, torch.Tensor): + # check that tensor is the expected length - x.size(0) + if self.dim < cond_item.ndim and cond_item.size(self.dim) == x_in.size(self.dim): + # if so, it's subsetting time - tell controls the expected indeces so they can handle them + actual_cond_item = window.get_tensor(cond_item) + resized_actual_cond[key] = actual_cond_item.to(device) + else: + resized_actual_cond[key] = cond_item.to(device) + # look for control + elif key == "control": + resized_actual_cond[key] = self.prepare_control_objects(cond_item, device) + elif isinstance(cond_item, dict): + new_cond_item = cond_item.copy() + # when in dictionary, look for tensors and CONDCrossAttn [comfy/conds.py] (has cond attr that is a tensor) + for cond_key, cond_value in new_cond_item.items(): + if isinstance(cond_value, torch.Tensor): + if cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim): + new_cond_item[cond_key] = window.get_tensor(cond_value, device) + # if has cond that is a Tensor, check if needs to be subset + elif hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor): + if cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim): + new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device)) + elif cond_key == "num_video_frames": # for SVD + new_cond_item[cond_key] = cond_value._copy_with(cond_value.cond) + new_cond_item[cond_key].cond = window.context_length + resized_actual_cond[key] = new_cond_item + else: + resized_actual_cond[key] = cond_item + finally: + del cond_item # just in case to prevent VRAM issues + resized_cond.append(resized_actual_cond) + return resized_cond + + def set_step(self, timestep: torch.Tensor, model_options: dict[str]): + indexes = torch.where(model_options["transformer_options"]["sample_sigmas"] == timestep[0]) + self._step = int(indexes[0]) + + def get_context_windows(self, model: BaseModel, x_in: torch.Tensor, model_options: dict[str]) -> list[IndexListContextWindow]: + full_length = x_in.size(self.dim) # TODO: choose dim based on model + context_windows = self.context_schedule.func(full_length, self, model_options) + context_windows = [IndexListContextWindow(window, dim=self.dim) for window in context_windows] + return context_windows + + def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]): + self.set_step(timestep, model_options) + context_windows = self.get_context_windows(model, x_in, model_options) + enumerated_context_windows = list(enumerate(context_windows)) + + conds_final = [torch.zeros_like(x_in) for _ in conds] + if self.fuse_method.name == ContextFuseMethods.RELATIVE: + counts_final = [torch.ones(get_shape_for_dim(x_in, self.dim), device=x_in.device) for _ in conds] + else: + counts_final = [torch.zeros(get_shape_for_dim(x_in, self.dim), device=x_in.device) for _ in conds] + biases_final = [([0.0] * x_in.shape[self.dim]) for _ in conds] + + for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EXECUTE_START, self.callbacks): + callback(self, model, x_in, conds, timestep, model_options) + + for enum_window in enumerated_context_windows: + results = self.evaluate_context_windows(calc_cond_batch, model, x_in, conds, timestep, [enum_window], model_options) + for result in results: + self.combine_context_window_results(x_in, result.sub_conds_out, result.sub_conds, result.window, result.window_idx, len(enumerated_context_windows), timestep, + conds_final, counts_final, biases_final) + try: + # finalize conds + if self.fuse_method.name == ContextFuseMethods.RELATIVE: + # relative is already normalized, so return as is + del counts_final + return conds_final + else: + # normalize conds via division by context usage counts + for i in range(len(conds_final)): + conds_final[i] /= counts_final[i] + del counts_final + return conds_final + finally: + for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EXECUTE_CLEANUP, self.callbacks): + callback(self, model, x_in, conds, timestep, model_options) + + def evaluate_context_windows(self, calc_cond_batch: Callable, model: BaseModel, x_in: torch.Tensor, conds, timestep: torch.Tensor, enumerated_context_windows: list[tuple[int, IndexListContextWindow]], + model_options, device=None, first_device=None): + results: list[ContextResults] = [] + for window_idx, window in enumerated_context_windows: + # allow processing to end between context window executions for faster Cancel + comfy.model_management.throw_exception_if_processing_interrupted() + + for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EVALUATE_CONTEXT_WINDOWS, self.callbacks): + callback(self, model, x_in, conds, timestep, model_options, window_idx, window, model_options, device, first_device) + + # update exposed params + model_options["transformer_options"]["context_window"] = window + # get subsections of x, timestep, conds + sub_x = window.get_tensor(x_in, device) + sub_timestep = window.get_tensor(timestep, device, dim=0) + sub_conds = [self.get_resized_cond(cond, x_in, window, device) for cond in conds] + + sub_conds_out = calc_cond_batch(model, sub_conds, sub_x, sub_timestep, model_options) + if device is not None: + for i in range(len(sub_conds_out)): + sub_conds_out[i] = sub_conds_out[i].to(x_in.device) + results.append(ContextResults(window_idx, sub_conds_out, sub_conds, window)) + return results + + + def combine_context_window_results(self, x_in: torch.Tensor, sub_conds_out, sub_conds, window: IndexListContextWindow, window_idx: int, total_windows: int, timestep: torch.Tensor, + conds_final: list[torch.Tensor], counts_final: list[torch.Tensor], biases_final: list[torch.Tensor]): + if self.fuse_method.name == ContextFuseMethods.RELATIVE: + for pos, idx in enumerate(window.index_list): + # bias is the influence of a specific index in relation to the whole context window + bias = 1 - abs(idx - (window.index_list[0] + window.index_list[-1]) / 2) / ((window.index_list[-1] - window.index_list[0] + 1e-2) / 2) + bias = max(1e-2, bias) + # take weighted average relative to total bias of current idx + for i in range(len(sub_conds_out)): + bias_total = biases_final[i][idx] + prev_weight = (bias_total / (bias_total + bias)) + new_weight = (bias / (bias_total + bias)) + # account for dims of tensors + idx_window = [slice(None)] * self.dim + [idx] + pos_window = [slice(None)] * self.dim + [pos] + # apply new values + conds_final[i][idx_window] = conds_final[i][idx_window] * prev_weight + sub_conds_out[i][pos_window] * new_weight + biases_final[i][idx] = bias_total + bias + else: + # add conds and counts based on weights of fuse method + weights = get_context_weights(window.context_length, x_in.shape[self.dim], window.index_list, self, sigma=timestep) + weights_tensor = match_weights_to_dim(weights, x_in, self.dim, device=x_in.device) + for i in range(len(sub_conds_out)): + window.add_window(conds_final[i], sub_conds_out[i] * weights_tensor) + window.add_window(counts_final[i], weights_tensor) + + for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.COMBINE_CONTEXT_WINDOW_RESULTS, self.callbacks): + callback(self, x_in, sub_conds_out, sub_conds, window, window_idx, total_windows, timestep, conds_final, counts_final, biases_final) + + +def _prepare_sampling_wrapper(executor, model, noise_shape: torch.Tensor, *args, **kwargs): + # limit noise_shape length to context_length for more accurate vram use estimation + model_options = kwargs.get("model_options", None) + if model_options is None: + raise Exception("model_options not found in prepare_sampling_wrapper; this should never happen, something went wrong.") + handler: IndexListContextHandler = model_options.get("context_handler", None) + if handler is not None: + noise_shape = list(noise_shape) + noise_shape[handler.dim] = min(noise_shape[handler.dim], handler.context_length) + return executor(model, noise_shape, *args, **kwargs) + + +def create_prepare_sampling_wrapper(model: ModelPatcher): + model.add_wrapper_with_key( + comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING, + "ContextWindows_prepare_sampling", + _prepare_sampling_wrapper + ) + + +def match_weights_to_dim(weights: list[float], x_in: torch.Tensor, dim: int, device=None) -> torch.Tensor: + total_dims = len(x_in.shape) + weights_tensor = torch.Tensor(weights).to(device=device) + for _ in range(dim): + weights_tensor = weights_tensor.unsqueeze(0) + for _ in range(total_dims - dim - 1): + weights_tensor = weights_tensor.unsqueeze(-1) + return weights_tensor + +def get_shape_for_dim(x_in: torch.Tensor, dim: int) -> list[int]: + total_dims = len(x_in.shape) + shape = [] + for _ in range(dim): + shape.append(1) + shape.append(x_in.shape[dim]) + for _ in range(total_dims - dim - 1): + shape.append(1) + return shape + +class ContextSchedules: + UNIFORM_LOOPED = "looped_uniform" + UNIFORM_STANDARD = "standard_uniform" + STATIC_STANDARD = "standard_static" + BATCHED = "batched" + + +# from https://github.com/neggles/animatediff-cli/blob/main/src/animatediff/pipelines/context.py +def create_windows_uniform_looped(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]): + windows = [] + if num_frames < handler.context_length: + windows.append(list(range(num_frames))) + return windows + + context_stride = min(handler.context_stride, int(np.ceil(np.log2(num_frames / handler.context_length))) + 1) + # obtain uniform windows as normal, looping and all + for context_step in 1 << np.arange(context_stride): + pad = int(round(num_frames * ordered_halving(handler._step))) + for j in range( + int(ordered_halving(handler._step) * context_step) + pad, + num_frames + pad + (0 if handler.closed_loop else -handler.context_overlap), + (handler.context_length * context_step - handler.context_overlap), + ): + windows.append([e % num_frames for e in range(j, j + handler.context_length * context_step, context_step)]) + + return windows + +def create_windows_uniform_standard(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]): + # unlike looped, uniform_straight does NOT allow windows that loop back to the beginning; + # instead, they get shifted to the corresponding end of the frames. + # in the case that a window (shifted or not) is identical to the previous one, it gets skipped. + windows = [] + if num_frames <= handler.context_length: + windows.append(list(range(num_frames))) + return windows + + context_stride = min(handler.context_stride, int(np.ceil(np.log2(num_frames / handler.context_length))) + 1) + # first, obtain uniform windows as normal, looping and all + for context_step in 1 << np.arange(context_stride): + pad = int(round(num_frames * ordered_halving(handler._step))) + for j in range( + int(ordered_halving(handler._step) * context_step) + pad, + num_frames + pad + (-handler.context_overlap), + (handler.context_length * context_step - handler.context_overlap), + ): + windows.append([e % num_frames for e in range(j, j + handler.context_length * context_step, context_step)]) + + # now that windows are created, shift any windows that loop, and delete duplicate windows + delete_idxs = [] + win_i = 0 + while win_i < len(windows): + # if window is rolls over itself, need to shift it + is_roll, roll_idx = does_window_roll_over(windows[win_i], num_frames) + if is_roll: + roll_val = windows[win_i][roll_idx] # roll_val might not be 0 for windows of higher strides + shift_window_to_end(windows[win_i], num_frames=num_frames) + # check if next window (cyclical) is missing roll_val + if roll_val not in windows[(win_i+1) % len(windows)]: + # need to insert new window here - just insert window starting at roll_val + windows.insert(win_i+1, list(range(roll_val, roll_val + handler.context_length))) + # delete window if it's not unique + for pre_i in range(0, win_i): + if windows[win_i] == windows[pre_i]: + delete_idxs.append(win_i) + break + win_i += 1 + + # reverse delete_idxs so that they will be deleted in an order that doesn't break idx correlation + delete_idxs.reverse() + for i in delete_idxs: + windows.pop(i) + + return windows + + +def create_windows_static_standard(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]): + windows = [] + if num_frames <= handler.context_length: + windows.append(list(range(num_frames))) + return windows + # always return the same set of windows + delta = handler.context_length - handler.context_overlap + for start_idx in range(0, num_frames, delta): + # if past the end of frames, move start_idx back to allow same context_length + ending = start_idx + handler.context_length + if ending >= num_frames: + final_delta = ending - num_frames + final_start_idx = start_idx - final_delta + windows.append(list(range(final_start_idx, final_start_idx + handler.context_length))) + break + windows.append(list(range(start_idx, start_idx + handler.context_length))) + return windows + + +def create_windows_batched(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]): + windows = [] + if num_frames <= handler.context_length: + windows.append(list(range(num_frames))) + return windows + # always return the same set of windows; + # no overlap, just cut up based on context_length; + # last window size will be different if num_frames % opts.context_length != 0 + for start_idx in range(0, num_frames, handler.context_length): + windows.append(list(range(start_idx, min(start_idx + handler.context_length, num_frames)))) + return windows + + +def create_windows_default(num_frames: int, handler: IndexListContextHandler): + return [list(range(num_frames))] + + +CONTEXT_MAPPING = { + ContextSchedules.UNIFORM_LOOPED: create_windows_uniform_looped, + ContextSchedules.UNIFORM_STANDARD: create_windows_uniform_standard, + ContextSchedules.STATIC_STANDARD: create_windows_static_standard, + ContextSchedules.BATCHED: create_windows_batched, +} + + +def get_matching_context_schedule(context_schedule: str) -> ContextSchedule: + func = CONTEXT_MAPPING.get(context_schedule, None) + if func is None: + raise ValueError(f"Unknown context_schedule '{context_schedule}'.") + return ContextSchedule(context_schedule, func) + + +def get_context_weights(length: int, full_length: int, idxs: list[int], handler: IndexListContextHandler, sigma: torch.Tensor=None): + return handler.fuse_method.func(length, sigma=sigma, handler=handler, full_length=full_length, idxs=idxs) + + +def create_weights_flat(length: int, **kwargs) -> list[float]: + # weight is the same for all + return [1.0] * length + +def create_weights_pyramid(length: int, **kwargs) -> list[float]: + # weight is based on the distance away from the edge of the context window; + # based on weighted average concept in FreeNoise paper + if length % 2 == 0: + max_weight = length // 2 + weight_sequence = list(range(1, max_weight + 1, 1)) + list(range(max_weight, 0, -1)) + else: + max_weight = (length + 1) // 2 + weight_sequence = list(range(1, max_weight, 1)) + [max_weight] + list(range(max_weight - 1, 0, -1)) + return weight_sequence + +def create_weights_overlap_linear(length: int, full_length: int, idxs: list[int], handler: IndexListContextHandler, **kwargs): + # based on code in Kijai's WanVideoWrapper: https://github.com/kijai/ComfyUI-WanVideoWrapper/blob/dbb2523b37e4ccdf45127e5ae33e31362f755c8e/nodes.py#L1302 + # only expected overlap is given different weights + weights_torch = torch.ones((length)) + # blend left-side on all except first window + if min(idxs) > 0: + ramp_up = torch.linspace(1e-37, 1, handler.context_overlap) + weights_torch[:handler.context_overlap] = ramp_up + # blend right-side on all except last window + if max(idxs) < full_length-1: + ramp_down = torch.linspace(1, 1e-37, handler.context_overlap) + weights_torch[-handler.context_overlap:] = ramp_down + return weights_torch + +class ContextFuseMethods: + FLAT = "flat" + PYRAMID = "pyramid" + RELATIVE = "relative" + OVERLAP_LINEAR = "overlap-linear" + + LIST = [PYRAMID, FLAT, OVERLAP_LINEAR] + LIST_STATIC = [PYRAMID, RELATIVE, FLAT, OVERLAP_LINEAR] + + +FUSE_MAPPING = { + ContextFuseMethods.FLAT: create_weights_flat, + ContextFuseMethods.PYRAMID: create_weights_pyramid, + ContextFuseMethods.RELATIVE: create_weights_pyramid, + ContextFuseMethods.OVERLAP_LINEAR: create_weights_overlap_linear, +} + +def get_matching_fuse_method(fuse_method: str) -> ContextFuseMethod: + func = FUSE_MAPPING.get(fuse_method, None) + if func is None: + raise ValueError(f"Unknown fuse_method '{fuse_method}'.") + return ContextFuseMethod(fuse_method, func) + +# Returns fraction that has denominator that is a power of 2 +def ordered_halving(val): + # get binary value, padded with 0s for 64 bits + bin_str = f"{val:064b}" + # flip binary value, padding included + bin_flip = bin_str[::-1] + # convert binary to int + as_int = int(bin_flip, 2) + # divide by 1 << 64, equivalent to 2**64, or 18446744073709551616, + # or b10000000000000000000000000000000000000000000000000000000000000000 (1 with 64 zero's) + return as_int / (1 << 64) + + +def get_missing_indexes(windows: list[list[int]], num_frames: int) -> list[int]: + all_indexes = list(range(num_frames)) + for w in windows: + for val in w: + try: + all_indexes.remove(val) + except ValueError: + pass + return all_indexes + + +def does_window_roll_over(window: list[int], num_frames: int) -> tuple[bool, int]: + prev_val = -1 + for i, val in enumerate(window): + val = val % num_frames + if val < prev_val: + return True, i + prev_val = val + return False, -1 + + +def shift_window_to_start(window: list[int], num_frames: int): + start_val = window[0] + for i in range(len(window)): + # 1) subtract each element by start_val to move vals relative to the start of all frames + # 2) add num_frames and take modulus to get adjusted vals + window[i] = ((window[i] - start_val) + num_frames) % num_frames + + +def shift_window_to_end(window: list[int], num_frames: int): + # 1) shift window to start + shift_window_to_start(window, num_frames) + end_val = window[-1] + end_delta = num_frames - end_val - 1 + for i in range(len(window)): + # 2) add end_delta to each val to slide windows to end + window[i] = window[i] + end_delta diff --git a/comfy/sampler_helpers.py b/comfy/sampler_helpers.py index 8dbc41455..e46971afb 100644 --- a/comfy/sampler_helpers.py +++ b/comfy/sampler_helpers.py @@ -149,7 +149,7 @@ def cleanup_models(conds, models): cleanup_additional_models(set(control_cleanup)) -def prepare_model_patcher(model: 'ModelPatcher', conds, model_options: dict): +def prepare_model_patcher(model: ModelPatcher, conds, model_options: dict): ''' Registers hooks from conds. ''' @@ -158,8 +158,8 @@ def prepare_model_patcher(model: 'ModelPatcher', conds, model_options: dict): for k in conds: get_hooks_from_cond(conds[k], hooks) # add wrappers and callbacks from ModelPatcher to transformer_options - model_options["transformer_options"]["wrappers"] = comfy.patcher_extension.copy_nested_dicts(model.wrappers) - model_options["transformer_options"]["callbacks"] = comfy.patcher_extension.copy_nested_dicts(model.callbacks) + comfy.patcher_extension.merge_nested_dicts(model_options["transformer_options"].setdefault("wrappers", {}), model.wrappers, copy_dict1=False) + comfy.patcher_extension.merge_nested_dicts(model_options["transformer_options"].setdefault("callbacks", {}), model.callbacks, copy_dict1=False) # begin registering hooks registered = comfy.hooks.HookGroup() target_dict = comfy.hooks.create_target_dict(comfy.hooks.EnumWeightTarget.Model) diff --git a/comfy/samplers.py b/comfy/samplers.py index ad2f40cdc..d5390d64e 100644 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -16,6 +16,7 @@ import comfy.sampler_helpers import comfy.model_patcher import comfy.patcher_extension import comfy.hooks +import comfy.context_windows import scipy.stats import numpy @@ -198,14 +199,20 @@ def finalize_default_conds(model: 'BaseModel', hooked_to_run: dict[comfy.hooks.H hooked_to_run.setdefault(p.hooks, list()) hooked_to_run[p.hooks] += [(p, i)] -def calc_cond_batch(model: 'BaseModel', conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options): +def calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options: dict[str]): + handler: comfy.context_windows.ContextHandlerABC = model_options.get("context_handler", None) + if handler is None or not handler.should_use_context(model, conds, x_in, timestep, model_options): + return _calc_cond_batch_outer(model, conds, x_in, timestep, model_options) + return handler.execute(_calc_cond_batch_outer, model, conds, x_in, timestep, model_options) + +def _calc_cond_batch_outer(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options): executor = comfy.patcher_extension.WrapperExecutor.new_executor( _calc_cond_batch, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.CALC_COND_BATCH, model_options, is_model_options=True) ) return executor.execute(model, conds, x_in, timestep, model_options) -def _calc_cond_batch(model: 'BaseModel', conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options): +def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options): out_conds = [] out_counts = [] # separate conds by matching hooks diff --git a/comfy_extras/nodes_context_windows.py b/comfy_extras/nodes_context_windows.py new file mode 100644 index 000000000..1c3d9e697 --- /dev/null +++ b/comfy_extras/nodes_context_windows.py @@ -0,0 +1,89 @@ +from __future__ import annotations +from comfy_api.latest import ComfyExtension, io +import comfy.context_windows +import nodes + + +class ContextWindowsManualNode(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="ContextWindowsManual", + display_name="Context Windows (Manual)", + category="context", + description="Manually set context windows.", + inputs=[ + io.Model.Input("model", tooltip="The model to apply context windows to during sampling."), + io.Int.Input("context_length", min=1, default=16, tooltip="The length of the context window."), + io.Int.Input("context_overlap", min=0, default=4, tooltip="The overlap of the context window."), + io.Combo.Input("context_schedule", options=[ + comfy.context_windows.ContextSchedules.STATIC_STANDARD, + comfy.context_windows.ContextSchedules.UNIFORM_STANDARD, + comfy.context_windows.ContextSchedules.UNIFORM_LOOPED, + comfy.context_windows.ContextSchedules.BATCHED, + ], tooltip="The stride of the context window."), + io.Int.Input("context_stride", min=1, default=1, tooltip="The stride of the context window; only applicable to uniform schedules."), + io.Boolean.Input("closed_loop", default=False, tooltip="Whether to close the context window loop; only applicable to looped schedules."), + io.Combo.Input("fuse_method", options=comfy.context_windows.ContextFuseMethods.LIST_STATIC, default=comfy.context_windows.ContextFuseMethods.PYRAMID, tooltip="The method to use to fuse the context windows."), + io.Int.Input("dim", min=0, max=5, default=0, tooltip="The dimension to apply the context windows to."), + ], + outputs=[ + io.Model.Output(tooltip="The model with context windows applied during sampling."), + ], + is_experimental=True, + ) + + @classmethod + def execute(cls, model: io.Model.Type, context_length: int, context_overlap: int, context_schedule: str, context_stride: int, closed_loop: bool, fuse_method: str, dim: int) -> io.Model: + model = model.clone() + model.model_options["context_handler"] = comfy.context_windows.IndexListContextHandler( + context_schedule=comfy.context_windows.get_matching_context_schedule(context_schedule), + fuse_method=comfy.context_windows.get_matching_fuse_method(fuse_method), + context_length=context_length, + context_overlap=context_overlap, + context_stride=context_stride, + closed_loop=closed_loop, + dim=dim) + # make memory usage calculation only take into account the context window latents + comfy.context_windows.create_prepare_sampling_wrapper(model) + return io.NodeOutput(model) + +class WanContextWindowsManualNode(ContextWindowsManualNode): + @classmethod + def define_schema(cls) -> io.Schema: + schema = super().define_schema() + schema.node_id = "WanContextWindowsManual" + schema.display_name = "WAN Context Windows (Manual)" + schema.description = "Manually set context windows for WAN-like models (dim=2)." + schema.inputs = [ + io.Model.Input("model", tooltip="The model to apply context windows to during sampling."), + io.Int.Input("context_length", min=1, max=nodes.MAX_RESOLUTION, step=4, default=81, tooltip="The length of the context window."), + io.Int.Input("context_overlap", min=0, default=30, tooltip="The overlap of the context window."), + io.Combo.Input("context_schedule", options=[ + comfy.context_windows.ContextSchedules.STATIC_STANDARD, + comfy.context_windows.ContextSchedules.UNIFORM_STANDARD, + comfy.context_windows.ContextSchedules.UNIFORM_LOOPED, + comfy.context_windows.ContextSchedules.BATCHED, + ], tooltip="The stride of the context window."), + io.Int.Input("context_stride", min=1, default=1, tooltip="The stride of the context window; only applicable to uniform schedules."), + io.Boolean.Input("closed_loop", default=False, tooltip="Whether to close the context window loop; only applicable to looped schedules."), + io.Combo.Input("fuse_method", options=comfy.context_windows.ContextFuseMethods.LIST_STATIC, default=comfy.context_windows.ContextFuseMethods.PYRAMID, tooltip="The method to use to fuse the context windows."), + ] + return schema + + @classmethod + def execute(cls, model: io.Model.Type, context_length: int, context_overlap: int, context_schedule: str, context_stride: int, closed_loop: bool, fuse_method: str) -> io.Model: + context_length = max(((context_length - 1) // 4) + 1, 1) # at least length 1 + context_overlap = max(((context_overlap - 1) // 4) + 1, 0) # at least overlap 0 + return super().execute(model, context_length, context_overlap, context_schedule, context_stride, closed_loop, fuse_method, dim=2) + + +class ContextWindowsExtension(ComfyExtension): + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + ContextWindowsManualNode, + WanContextWindowsManualNode, + ] + +def comfy_entrypoint(): + return ContextWindowsExtension() diff --git a/nodes.py b/nodes.py index 9448f9c1b..860a236aa 100644 --- a/nodes.py +++ b/nodes.py @@ -2320,6 +2320,7 @@ async def init_builtin_extra_nodes(): "nodes_camera_trajectory.py", "nodes_edit_model.py", "nodes_tcfg.py", + "nodes_context_windows.py", ] import_failed = [] From 72fd4d22b6a4fa11a3f737c9a633e7d635a42181 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 14 Aug 2025 13:03:21 -0700 Subject: [PATCH 133/759] av is an essential dependency. (#9341) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bfb31a73f..551002b5b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,11 +20,11 @@ tqdm psutil alembic SQLAlchemy +av>=14.2.0 #non essential dependencies: kornia>=0.7.1 spandrel soundfile -av>=14.2.0 pydantic~=2.0 pydantic-settings~=2.0 From 644b23ac0b92442b475e44397c62aa8de929d546 Mon Sep 17 00:00:00 2001 From: filtered <176114999+webfiltered@users.noreply.github.com> Date: Fri, 15 Aug 2025 07:36:53 +1000 Subject: [PATCH 134/759] Make custom node testing checkbox optional in issue templates (#9342) The checkbox for confirming custom node testing is now optional in both bug report and user support templates. This allows users to submit issues even if they haven't been able to test with custom nodes disabled, making the reporting process more accessible. --- .github/ISSUE_TEMPLATE/bug-report.yml | 2 +- .github/ISSUE_TEMPLATE/user-support.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 69ce998eb..3cf2717b7 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -22,7 +22,7 @@ body: description: Please confirm you have tried to reproduce the issue with all custom nodes disabled. options: - label: I have tried disabling custom nodes and the issue persists (see [how to disable custom nodes](https://docs.comfy.org/troubleshooting/custom-node-issues#step-1%3A-test-with-all-custom-nodes-disabled) if you need help) - required: true + required: false - type: textarea attributes: label: Expected Behavior diff --git a/.github/ISSUE_TEMPLATE/user-support.yml b/.github/ISSUE_TEMPLATE/user-support.yml index 50657d493..281661f92 100644 --- a/.github/ISSUE_TEMPLATE/user-support.yml +++ b/.github/ISSUE_TEMPLATE/user-support.yml @@ -18,7 +18,7 @@ body: description: Please confirm you have tried to reproduce the issue with all custom nodes disabled. options: - label: I have tried disabling custom nodes and the issue persists (see [how to disable custom nodes](https://docs.comfy.org/troubleshooting/custom-node-issues#step-1%3A-test-with-all-custom-nodes-disabled) if you need help) - required: true + required: false - type: textarea attributes: label: Your question From fa570cbf599657e73c636872616c0b1f8e74f692 Mon Sep 17 00:00:00 2001 From: Yoland Yan <4950057+yoland68@users.noreply.github.com> Date: Thu, 14 Aug 2025 16:44:22 -0700 Subject: [PATCH 135/759] Update CODEOWNERS (#9343) --- CODEOWNERS | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index c4acbf06e..c8acd66d5 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -5,20 +5,21 @@ # Inlined the team members for now. # Maintainers -*.md @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/tests/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/tests-unit/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/notebooks/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/script_examples/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/.github/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/requirements.txt @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne -/pyproject.toml @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne +*.md @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/tests/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/tests-unit/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/notebooks/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/script_examples/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/.github/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/requirements.txt @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill +/pyproject.toml @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill # Python web server -/api_server/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne -/app/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne -/utils/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne +/api_server/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill +/app/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill +/utils/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill # Node developers -/comfy_extras/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne -/comfy/comfy_types/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne +/comfy_extras/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill +/comfy/comfy_types/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill +/comfy_api_nodes/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill From deebee4ff6fd1b2713683d22e5e2e07170daa867 Mon Sep 17 00:00:00 2001 From: guill Date: Thu, 14 Aug 2025 18:46:55 -0700 Subject: [PATCH 136/759] Update default parameters for Moonvalley video nodes (#9290) * Update default parameters for Moonvalley video nodes - Changed default negative prompts to a more extensive list for both BaseMoonvalleyVideoNode and MoonvalleyVideo2VideoNode. - Updated default guidance scale values for both nodes to enhance prompt adherence. - Set a fixed default seed value for consistency in video generation. * no message * ruff fix --------- Co-authored-by: thorsten --- comfy_api_nodes/nodes_moonvalley.py | 128 ++++++++++++++++++++-------- 1 file changed, 91 insertions(+), 37 deletions(-) diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 164ca3ea5..806a70e06 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -1,6 +1,5 @@ import logging from typing import Any, Callable, Optional, TypeVar -import random import torch from comfy_api_nodes.util.validation_utils import ( get_image_dimensions, @@ -208,20 +207,29 @@ def _get_video_dimensions(video: VideoInput) -> tuple[int, int]: def _validate_video_dimensions(width: int, height: int) -> None: """Validates video dimensions meet Moonvalley V2V requirements.""" supported_resolutions = { - (1920, 1080), (1080, 1920), (1152, 1152), - (1536, 1152), (1152, 1536) + (1920, 1080), + (1080, 1920), + (1152, 1152), + (1536, 1152), + (1152, 1536), } if (width, height) not in supported_resolutions: - supported_list = ', '.join([f'{w}x{h}' for w, h in sorted(supported_resolutions)]) - raise ValueError(f"Resolution {width}x{height} not supported. Supported: {supported_list}") + supported_list = ", ".join( + [f"{w}x{h}" for w, h in sorted(supported_resolutions)] + ) + raise ValueError( + f"Resolution {width}x{height} not supported. Supported: {supported_list}" + ) def _validate_container_format(video: VideoInput) -> None: """Validates video container format is MP4.""" container_format = video.get_container_format() - if container_format not in ['mp4', 'mov,mp4,m4a,3gp,3g2,mj2']: - raise ValueError(f"Only MP4 container format supported. Got: {container_format}") + if container_format not in ["mp4", "mov,mp4,m4a,3gp,3g2,mj2"]: + raise ValueError( + f"Only MP4 container format supported. Got: {container_format}" + ) def _validate_and_trim_duration(video: VideoInput) -> VideoInput: @@ -244,7 +252,6 @@ def _trim_if_too_long(video: VideoInput, duration: float) -> VideoInput: return video - def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: """ Returns a new VideoInput object trimmed from the beginning to the specified duration, @@ -302,7 +309,9 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: # Calculate target frame count that's divisible by 16 fps = input_container.streams.video[0].average_rate estimated_frames = int(duration_sec * fps) - target_frames = (estimated_frames // 16) * 16 # Round down to nearest multiple of 16 + target_frames = ( + estimated_frames // 16 + ) * 16 # Round down to nearest multiple of 16 if target_frames == 0: raise ValueError("Video too short: need at least 16 frames for Moonvalley") @@ -424,7 +433,7 @@ class BaseMoonvalleyVideoNode: MoonvalleyTextToVideoInferenceParams, "negative_prompt", multiline=True, - default="low-poly, flat shader, bad rigging, stiff animation, uncanny eyes, low-quality textures, looping glitch, cheap effect, overbloom, bloom spam, default lighting, game asset, stiff face, ugly specular, AI artifacts", + default=" gopro, bright, contrast, static, overexposed, vignette, artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, wobbly, weird, low quality, plastic, stock footage, video camera, boring", ), "resolution": ( IO.COMBO, @@ -441,12 +450,11 @@ class BaseMoonvalleyVideoNode: "tooltip": "Resolution of the output video", }, ), - # "length": (IO.COMBO,{"options":['5s','10s'], "default": '5s'}), "prompt_adherence": model_field_to_node_input( IO.FLOAT, MoonvalleyTextToVideoInferenceParams, "guidance_scale", - default=7.0, + default=10.0, step=1, min=1, max=20, @@ -455,13 +463,12 @@ class BaseMoonvalleyVideoNode: IO.INT, MoonvalleyTextToVideoInferenceParams, "seed", - default=random.randint(0, 2**32 - 1), + default=9, min=0, max=4294967295, step=1, display="number", tooltip="Random seed value", - control_after_generate=True, ), "steps": model_field_to_node_input( IO.INT, @@ -532,9 +539,11 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): # Get MIME type from tensor - assuming PNG format for image tensors mime_type = "image/png" - image_url = (await upload_images_to_comfyapi( - image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type - ))[0] + image_url = ( + await upload_images_to_comfyapi( + image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type + ) + )[0] request = MoonvalleyTextToVideoRequest( image_url=image_url, prompt_text=prompt, inference_params=inference_params @@ -570,17 +579,39 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): return { "required": { "prompt": model_field_to_node_input( - IO.STRING, MoonvalleyVideoToVideoRequest, "prompt_text", - multiline=True + IO.STRING, + MoonvalleyVideoToVideoRequest, + "prompt_text", + multiline=True, ), "negative_prompt": model_field_to_node_input( IO.STRING, MoonvalleyVideoToVideoInferenceParams, "negative_prompt", multiline=True, - default="low-poly, flat shader, bad rigging, stiff animation, uncanny eyes, low-quality textures, looping glitch, cheap effect, overbloom, bloom spam, default lighting, game asset, stiff face, ugly specular, AI artifacts" + default=" gopro, bright, contrast, static, overexposed, vignette, artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, wobbly, weird, low quality, plastic, stock footage, video camera, boring", + ), + "seed": model_field_to_node_input( + IO.INT, + MoonvalleyVideoToVideoInferenceParams, + "seed", + default=9, + min=0, + max=4294967295, + step=1, + display="number", + tooltip="Random seed value", + control_after_generate=False, + ), + "prompt_adherence": model_field_to_node_input( + IO.FLOAT, + MoonvalleyVideoToVideoInferenceParams, + "guidance_scale", + default=10.0, + step=1, + min=1, + max=20, ), - "seed": model_field_to_node_input(IO.INT,MoonvalleyVideoToVideoInferenceParams, "seed", default=random.randint(0, 2**32 - 1), min=0, max=4294967295, step=1, display="number", tooltip="Random seed value", control_after_generate=True), }, "hidden": { "auth_token": "AUTH_TOKEN_COMFY_ORG", @@ -588,7 +619,14 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): "unique_id": "UNIQUE_ID", }, "optional": { - "video": (IO.VIDEO, {"default": "", "multiline": False, "tooltip": "The reference video used to generate the output video. Must be at least 5 seconds long. Videos longer than 5s will be automatically trimmed. Only MP4 format supported."}), + "video": ( + IO.VIDEO, + { + "default": "", + "multiline": False, + "tooltip": "The reference video used to generate the output video. Must be at least 5 seconds long. Videos longer than 5s will be automatically trimmed. Only MP4 format supported.", + }, + ), "control_type": ( ["Motion Transfer", "Pose Transfer"], {"default": "Motion Transfer"}, @@ -602,8 +640,14 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): "max": 100, "tooltip": "Only used if control_type is 'Motion Transfer'", }, - ) - } + ), + "image": model_field_to_node_input( + IO.IMAGE, + MoonvalleyTextToVideoRequest, + "image_url", + tooltip="The reference image used to generate the video", + ), + }, } RETURN_TYPES = ("VIDEO",) @@ -613,6 +657,7 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs ): video = kwargs.get("video") + image = kwargs.get("image", None) if not video: raise MoonvalleyApiError("video is required") @@ -620,8 +665,16 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): video_url = "" if video: validated_video = validate_video_to_video_input(video) - video_url = await upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs) + video_url = await upload_video_to_comfyapi( + validated_video, auth_kwargs=kwargs + ) + mime_type = "image/png" + if not image is None: + validate_input_image(image, with_frame_conditioning=True) + image_url = await upload_images_to_comfyapi( + image=image, auth_kwargs=kwargs, max_images=1, mime_type=mime_type + ) control_type = kwargs.get("control_type") motion_intensity = kwargs.get("motion_intensity") @@ -631,12 +684,12 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): # Only include motion_intensity for Motion Transfer control_params = {} if control_type == "Motion Transfer" and motion_intensity is not None: - control_params['motion_intensity'] = motion_intensity + control_params["motion_intensity"] = motion_intensity - inference_params=MoonvalleyVideoToVideoInferenceParams( + inference_params = MoonvalleyVideoToVideoInferenceParams( negative_prompt=negative_prompt, seed=kwargs.get("seed"), - control_params=control_params + control_params=control_params, ) control = self.parseControlParameter(control_type) @@ -647,6 +700,7 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): prompt_text=prompt, inference_params=inference_params, ) + request.image_url = image_url if not image is None else None initial_operation = SynchronousOperation( endpoint=ApiEndpoint( @@ -694,15 +748,15 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) - inference_params=MoonvalleyTextToVideoInferenceParams( - negative_prompt=negative_prompt, - steps=kwargs.get("steps"), - seed=kwargs.get("seed"), - guidance_scale=kwargs.get("prompt_adherence"), - num_frames=128, - width=width_height.get("width"), - height=width_height.get("height"), - ) + inference_params = MoonvalleyTextToVideoInferenceParams( + negative_prompt=negative_prompt, + steps=kwargs.get("steps"), + seed=kwargs.get("seed"), + guidance_scale=kwargs.get("prompt_adherence"), + num_frames=128, + width=width_height.get("width"), + height=width_height.get("height"), + ) request = MoonvalleyTextToVideoRequest( prompt_text=prompt, inference_params=inference_params ) From 5d65d6753b195d674ce16522d6c34f9a33f36269 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 15 Aug 2025 04:48:41 +0300 Subject: [PATCH 137/759] convert WAN nodes to V3 schema (#9201) --- comfy_extras/nodes_wan.py | 549 +++++++++++++++++++++----------------- 1 file changed, 298 insertions(+), 251 deletions(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index f80c83ba6..694a183f6 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -9,29 +9,35 @@ import comfy.clip_vision import json import numpy as np from typing import Tuple +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class WanImageToVideo: +class WanImageToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"clip_vision_output": ("CLIP_VISION_OUTPUT", ), - "start_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanImageToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + io.Image.Input("start_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) if start_image is not None: start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) @@ -51,32 +57,36 @@ class WanImageToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -class WanFunControlToVideo: +class WanFunControlToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"clip_vision_output": ("CLIP_VISION_OUTPUT", ), - "start_image": ("IMAGE", ), - "control_video": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanFunControlToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + io.Image.Input("start_image", optional=True), + io.Image.Input("control_video", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, control_video=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, control_video=None) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) @@ -101,31 +111,34 @@ class WanFunControlToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -class Wan22FunControlToVideo: +class Wan22FunControlToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"ref_image": ("IMAGE", ), - "control_video": ("IMAGE", ), - # "start_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="Wan22FunControlToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Image.Input("ref_image", optional=True), + io.Image.Input("control_video", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, ref_image=None, start_image=None, control_video=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, start_image=None, control_video=None) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) @@ -158,32 +171,36 @@ class Wan22FunControlToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -class WanFirstLastFrameToVideo: +class WanFirstLastFrameToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"clip_vision_start_image": ("CLIP_VISION_OUTPUT", ), - "clip_vision_end_image": ("CLIP_VISION_OUTPUT", ), - "start_image": ("IMAGE", ), - "end_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanFirstLastFrameToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.ClipVisionOutput.Input("clip_vision_start_image", optional=True), + io.ClipVisionOutput.Input("clip_vision_end_image", optional=True), + io.Image.Input("start_image", optional=True), + io.Image.Input("end_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) if start_image is not None: start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) @@ -224,62 +241,70 @@ class WanFirstLastFrameToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -class WanFunInpaintToVideo: +class WanFunInpaintToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"clip_vision_output": ("CLIP_VISION_OUTPUT", ), - "start_image": ("IMAGE", ), - "end_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanFunInpaintToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + io.Image.Input("start_image", optional=True), + io.Image.Input("end_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_output=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_output=None) -> io.NodeOutput: flfv = WanFirstLastFrameToVideo() - return flfv.encode(positive, negative, vae, width, height, length, batch_size, start_image=start_image, end_image=end_image, clip_vision_start_image=clip_vision_output) + return flfv.execute(positive, negative, vae, width, height, length, batch_size, start_image=start_image, end_image=end_image, clip_vision_start_image=clip_vision_output) -class WanVaceToVideo: +class WanVaceToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1000.0, "step": 0.01}), - }, - "optional": {"control_video": ("IMAGE", ), - "control_masks": ("MASK", ), - "reference_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanVaceToVideo", + category="conditioning/video_models", + is_experimental=True, + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Float.Input("strength", default=1.0, min=0.0, max=1000.0, step=0.01), + io.Image.Input("control_video", optional=True), + io.Mask.Input("control_masks", optional=True), + io.Image.Input("reference_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + io.Int.Output(display_name="trim_latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT", "INT") - RETURN_NAMES = ("positive", "negative", "latent", "trim_latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - EXPERIMENTAL = True - - def encode(self, positive, negative, vae, width, height, length, batch_size, strength, control_video=None, control_masks=None, reference_image=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, strength, control_video=None, control_masks=None, reference_image=None) -> io.NodeOutput: latent_length = ((length - 1) // 4) + 1 if control_video is not None: control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) @@ -336,52 +361,59 @@ class WanVaceToVideo: latent = torch.zeros([batch_size, 16, latent_length, height // 8, width // 8], device=comfy.model_management.intermediate_device()) out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent, trim_latent) + return io.NodeOutput(positive, negative, out_latent, trim_latent) -class TrimVideoLatent: +class TrimVideoLatent(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples": ("LATENT",), - "trim_amount": ("INT", {"default": 0, "min": 0, "max": 99999}), - }} + def define_schema(cls): + return io.Schema( + node_id="TrimVideoLatent", + category="latent/video", + is_experimental=True, + inputs=[ + io.Latent.Input("samples"), + io.Int.Input("trim_amount", default=0, min=0, max=99999), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/video" - - EXPERIMENTAL = True - - def op(self, samples, trim_amount): + @classmethod + def execute(cls, samples, trim_amount) -> io.NodeOutput: samples_out = samples.copy() s1 = samples["samples"] samples_out["samples"] = s1[:, :, trim_amount:] - return (samples_out,) + return io.NodeOutput(samples_out) -class WanCameraImageToVideo: +class WanCameraImageToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"clip_vision_output": ("CLIP_VISION_OUTPUT", ), - "start_image": ("IMAGE", ), - "camera_conditions": ("WAN_CAMERA_EMBEDDING", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanCameraImageToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + io.Image.Input("start_image", optional=True), + io.WanCameraEmbedding.Input("camera_conditions", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, camera_conditions=None): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None, camera_conditions=None) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) @@ -404,29 +436,34 @@ class WanCameraImageToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -class WanPhantomSubjectToVideo: +class WanPhantomSubjectToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"images": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanPhantomSubjectToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Image.Input("images", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative_text"), + io.Conditioning.Output(display_name="negative_img_text"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative_text", "negative_img_text", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, width, height, length, batch_size, images): + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, images) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) cond2 = negative if images is not None: @@ -442,7 +479,7 @@ class WanPhantomSubjectToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, cond2, negative, out_latent) + return io.NodeOutput(positive, cond2, negative, out_latent) def parse_json_tracks(tracks): """Parse JSON track data into a standardized format""" @@ -655,39 +692,41 @@ def patch_motion( return out_mask_full, out_feature_full -class WanTrackToVideo: +class WanTrackToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "tracks": ("STRING", {"multiline": True, "default": "[]"}), - "width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - "temperature": ("FLOAT", {"default": 220.0, "min": 1.0, "max": 1000.0, "step": 0.1}), - "topk": ("INT", {"default": 2, "min": 1, "max": 10}), - "start_image": ("IMAGE", ), - }, - "optional": { - "clip_vision_output": ("CLIP_VISION_OUTPUT", ), - }} + def define_schema(cls): + return io.Schema( + node_id="WanPhantomSubjectToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.String.Input("tracks", multiline=True, default="[]"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Float.Input("temperature", default=220.0, min=1.0, max=1000.0, step=0.1), + io.Int.Input("topk", default=2, min=1, max=10), + io.Image.Input("start_image"), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, negative, vae, tracks, width, height, length, batch_size, - temperature, topk, start_image=None, clip_vision_output=None): + @classmethod + def execute(cls, positive, negative, vae, tracks, width, height, length, batch_size, + temperature, topk, start_image=None, clip_vision_output=None) -> io.NodeOutput: tracks_data = parse_json_tracks(tracks) if not tracks_data: - return WanImageToVideo().encode(positive, negative, vae, width, height, length, batch_size, start_image=start_image, clip_vision_output=clip_vision_output) + return WanImageToVideo().execute(positive, negative, vae, width, height, length, batch_size, start_image=start_image, clip_vision_output=clip_vision_output) latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) @@ -741,34 +780,36 @@ class WanTrackToVideo: out_latent = {} out_latent["samples"] = latent - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -class Wan22ImageToVideoLatent: +class Wan22ImageToVideoLatent(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"vae": ("VAE", ), - "width": ("INT", {"default": 1280, "min": 32, "max": nodes.MAX_RESOLUTION, "step": 32}), - "height": ("INT", {"default": 704, "min": 32, "max": nodes.MAX_RESOLUTION, "step": 32}), - "length": ("INT", {"default": 49, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"start_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="Wan22ImageToVideoLatent", + category="conditioning/inpaint", + inputs=[ + io.Vae.Input("vae"), + io.Int.Input("width", default=1280, min=32, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("height", default=704, min=32, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("length", default=49, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Image.Input("start_image", optional=True), + ], + outputs=[ + io.Latent.Output(), + ], + ) - - RETURN_TYPES = ("LATENT",) - FUNCTION = "encode" - - CATEGORY = "conditioning/inpaint" - - def encode(self, vae, width, height, length, batch_size, start_image=None): + @classmethod + def execute(cls, vae, width, height, length, batch_size, start_image=None) -> io.NodeOutput: latent = torch.zeros([1, 48, ((length - 1) // 4) + 1, height // 16, width // 16], device=comfy.model_management.intermediate_device()) if start_image is None: out_latent = {} out_latent["samples"] = latent - return (out_latent,) + return io.NodeOutput(out_latent) mask = torch.ones([latent.shape[0], 1, ((length - 1) // 4) + 1, latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device()) @@ -783,19 +824,25 @@ class Wan22ImageToVideoLatent: latent = latent_format.process_out(latent) * mask + latent * (1.0 - mask) out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1)) out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1)) - return (out_latent,) + return io.NodeOutput(out_latent) -NODE_CLASS_MAPPINGS = { - "WanTrackToVideo": WanTrackToVideo, - "WanImageToVideo": WanImageToVideo, - "WanFunControlToVideo": WanFunControlToVideo, - "Wan22FunControlToVideo": Wan22FunControlToVideo, - "WanFunInpaintToVideo": WanFunInpaintToVideo, - "WanFirstLastFrameToVideo": WanFirstLastFrameToVideo, - "WanVaceToVideo": WanVaceToVideo, - "TrimVideoLatent": TrimVideoLatent, - "WanCameraImageToVideo": WanCameraImageToVideo, - "WanPhantomSubjectToVideo": WanPhantomSubjectToVideo, - "Wan22ImageToVideoLatent": Wan22ImageToVideoLatent, -} +class WanExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + WanTrackToVideo, + WanImageToVideo, + WanFunControlToVideo, + Wan22FunControlToVideo, + WanFunInpaintToVideo, + WanFirstLastFrameToVideo, + WanVaceToVideo, + TrimVideoLatent, + WanCameraImageToVideo, + WanPhantomSubjectToVideo, + Wan22ImageToVideoLatent, + ] + +async def comfy_entrypoint() -> WanExtension: + return WanExtension() From ad19a069f68a19566632b9bda3e72f4eed8a22d8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 14 Aug 2025 20:16:01 -0700 Subject: [PATCH 138/759] Make SLG nodes work on Qwen Image model. (#9345) --- comfy/ldm/qwen_image/model.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index c15ab8e40..99843f88d 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -356,6 +356,7 @@ class QwenImageTransformer2DModel(nn.Module): context, attention_mask=None, guidance: torch.Tensor = None, + transformer_options={}, **kwargs ): timestep = timesteps @@ -383,14 +384,26 @@ class QwenImageTransformer2DModel(nn.Module): else self.time_text_embed(timestep, guidance, hidden_states) ) - for block in self.transformer_blocks: - encoder_hidden_states, hidden_states = block( - hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - encoder_hidden_states_mask=encoder_hidden_states_mask, - temb=temb, - image_rotary_emb=image_rotary_emb, - ) + patches_replace = transformer_options.get("patches_replace", {}) + blocks_replace = patches_replace.get("dit", {}) + + for i, block in enumerate(self.transformer_blocks): + if ("double_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["txt"], out["img"] = block(hidden_states=args["img"], encoder_hidden_states=args["txt"], encoder_hidden_states_mask=encoder_hidden_states_mask, temb=args["vec"], image_rotary_emb=args["pe"]) + return out + out = blocks_replace[("double_block", i)]({"img": hidden_states, "txt": encoder_hidden_states, "vec": temb, "pe": image_rotary_emb}, {"original_block": block_wrap}) + hidden_states = out["img"] + encoder_hidden_states = out["txt"] + else: + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + encoder_hidden_states_mask=encoder_hidden_states_mask, + temb=temb, + image_rotary_emb=image_rotary_emb, + ) hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) From f0d5d0111f1f78bc8ce5d1f3968f19e40cd2ce7b Mon Sep 17 00:00:00 2001 From: "Xiangxi Guo (Ryan)" Date: Thu, 14 Aug 2025 20:41:37 -0700 Subject: [PATCH 139/759] Avoid torch compile graphbreak for older pytorch versions (#9344) Turns out torch.compile has some gaps in context manager decorator syntax support. I've sent patches to fix that in PyTorch, but it won't be available for all the folks running older versions of PyTorch, hence this trivial patch. --- comfy/ops.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index be312d714..2be35f76a 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -41,9 +41,11 @@ try: SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION) - @sdpa_kernel(backends=SDPA_BACKEND_PRIORITY, set_priority=True) def scaled_dot_product_attention(q, k, v, *args, **kwargs): - return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) + # Use this (rather than the decorator syntax) to eliminate graph + # break for pytorch < 2.9 + with sdpa_kernel(SDPA_BACKEND_PRIORITY, set_priority=True): + return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) except (ModuleNotFoundError, TypeError): logging.warning("Could not set sdpa backend priority.") From 4e5c230f6a957962961794c07f02be748076c771 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 14 Aug 2025 20:44:02 -0700 Subject: [PATCH 140/759] Fix last commit not working on older pytorch. (#9346) --- comfy/ops.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 2be35f76a..18e7db705 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -32,20 +32,21 @@ def scaled_dot_product_attention(q, k, v, *args, **kwargs): try: if torch.cuda.is_available(): from torch.nn.attention import SDPBackend, sdpa_kernel + import inspect + if "set_priority" in inspect.signature(sdpa_kernel).parameters: + SDPA_BACKEND_PRIORITY = [ + SDPBackend.FLASH_ATTENTION, + SDPBackend.EFFICIENT_ATTENTION, + SDPBackend.MATH, + ] - SDPA_BACKEND_PRIORITY = [ - SDPBackend.FLASH_ATTENTION, - SDPBackend.EFFICIENT_ATTENTION, - SDPBackend.MATH, - ] + SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION) - SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION) - - def scaled_dot_product_attention(q, k, v, *args, **kwargs): - # Use this (rather than the decorator syntax) to eliminate graph - # break for pytorch < 2.9 - with sdpa_kernel(SDPA_BACKEND_PRIORITY, set_priority=True): - return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) + def scaled_dot_product_attention(q, k, v, *args, **kwargs): + with sdpa_kernel(SDPA_BACKEND_PRIORITY, set_priority=True): + return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) + else: + logging.warning("Torch version too old to set sdpa backend priority.") except (ModuleNotFoundError, TypeError): logging.warning("Could not set sdpa backend priority.") From e08ecfbd8a9deda8939b14d7f1ff7d7139f1a4ed Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 14 Aug 2025 21:22:26 -0700 Subject: [PATCH 141/759] Add warning when using old pytorch. (#9347) --- comfy/rmsnorm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/rmsnorm.py b/comfy/rmsnorm.py index 66ae8321d..555542a46 100644 --- a/comfy/rmsnorm.py +++ b/comfy/rmsnorm.py @@ -1,6 +1,7 @@ import torch import comfy.model_management import numbers +import logging RMSNorm = None @@ -9,6 +10,7 @@ try: RMSNorm = torch.nn.RMSNorm except: rms_norm_torch = None + logging.warning("Please update pytorch to use native RMSNorm") def rms_norm(x, weight=None, eps=1e-6): From 027c63f63a7f5f380a4df1057c548410b0a87606 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 15 Aug 2025 21:57:47 +0300 Subject: [PATCH 142/759] fix(OpenAIGPTImage1): set correct MIME type for multipart uploads to OpenAI edits (#9348) --- comfy_api_nodes/nodes_openai.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index ab3c5363b..cbff2b2d2 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -464,8 +464,6 @@ class OpenAIGPTImage1(ComfyNodeABC): path = "/proxy/openai/images/generations" content_type = "application/json" request_class = OpenAIImageGenerationRequest - img_binaries = [] - mask_binary = None files = [] if image is not None: @@ -484,14 +482,11 @@ class OpenAIGPTImage1(ComfyNodeABC): img_byte_arr = io.BytesIO() img.save(img_byte_arr, format="PNG") img_byte_arr.seek(0) - img_binary = img_byte_arr - img_binary.name = f"image_{i}.png" - img_binaries.append(img_binary) if batch_size == 1: - files.append(("image", img_binary)) + files.append(("image", (f"image_{i}.png", img_byte_arr, "image/png"))) else: - files.append(("image[]", img_binary)) + files.append(("image[]", (f"image_{i}.png", img_byte_arr, "image/png"))) if mask is not None: if image is None: @@ -511,9 +506,7 @@ class OpenAIGPTImage1(ComfyNodeABC): mask_img_byte_arr = io.BytesIO() mask_img.save(mask_img_byte_arr, format="PNG") mask_img_byte_arr.seek(0) - mask_binary = mask_img_byte_arr - mask_binary.name = "mask.png" - files.append(("mask", mask_binary)) + files.append(("mask", ("mask.png", mask_img_byte_arr, "image/png"))) # Build the operation operation = SynchronousOperation( From c308a8840aebf06649364e8e175862250a2d8823 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 15 Aug 2025 12:50:39 -0700 Subject: [PATCH 143/759] Add FluxKontextMultiReferenceLatentMethod node. (#9356) This node is only useful if someone trains the kontext model to properly use multiple reference images via the index method. The default is the offset method which feeds the multiple images like if they were stitched together as one. This method works with the current flux kontext model. --- comfy/ldm/flux/model.py | 24 ++++++++++++++++-------- comfy/model_base.py | 4 ++++ comfy_extras/nodes_flux.py | 19 +++++++++++++++++++ 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 8f4d99f54..c4de82795 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -224,19 +224,27 @@ class Flux(nn.Module): if ref_latents is not None: h = 0 w = 0 + index = 0 + index_ref_method = kwargs.get("ref_latents_method", "offset") == "index" for ref in ref_latents: - h_offset = 0 - w_offset = 0 - if ref.shape[-2] + h > ref.shape[-1] + w: - w_offset = w + if index_ref_method: + index += 1 + h_offset = 0 + w_offset = 0 else: - h_offset = h + index = 1 + h_offset = 0 + w_offset = 0 + if ref.shape[-2] + h > ref.shape[-1] + w: + w_offset = w + else: + h_offset = h + h = max(h, ref.shape[-2] + h_offset) + w = max(w, ref.shape[-1] + w_offset) - kontext, kontext_ids = self.process_img(ref, index=1, h_offset=h_offset, w_offset=w_offset) + kontext, kontext_ids = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset) img = torch.cat([img, kontext], dim=1) img_ids = torch.cat([img_ids, kontext_ids], dim=1) - h = max(h, ref.shape[-2] + h_offset) - w = max(w, ref.shape[-1] + w_offset) txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None)) diff --git a/comfy/model_base.py b/comfy/model_base.py index cde61df7c..bf874b875 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -890,6 +890,10 @@ class Flux(BaseModel): for lat in ref_latents: latents.append(self.process_latent_in(lat)) out['ref_latents'] = comfy.conds.CONDList(latents) + + ref_latents_method = kwargs.get("reference_latents_method", None) + if ref_latents_method is not None: + out['ref_latents_method'] = comfy.conds.CONDConstant(ref_latents_method) return out def extra_conds_shapes(self, **kwargs): diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index 8a8a17698..c8db75bb3 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -100,9 +100,28 @@ class FluxKontextImageScale: return (image, ) +class FluxKontextMultiReferenceLatentMethod: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "conditioning": ("CONDITIONING", ), + "reference_latents_method": (("offset", "index"), ), + }} + + RETURN_TYPES = ("CONDITIONING",) + FUNCTION = "append" + EXPERIMENTAL = True + + CATEGORY = "advanced/conditioning/flux" + + def append(self, conditioning, reference_latents_method): + c = node_helpers.conditioning_set_values(conditioning, {"reference_latents_method": reference_latents_method}) + return (c, ) + NODE_CLASS_MAPPINGS = { "CLIPTextEncodeFlux": CLIPTextEncodeFlux, "FluxGuidance": FluxGuidance, "FluxDisableGuidance": FluxDisableGuidance, "FluxKontextImageScale": FluxKontextImageScale, + "FluxKontextMultiReferenceLatentMethod": FluxKontextMultiReferenceLatentMethod, } From 1702e6df16b0a52e147f19e3d5c5548c25a64339 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 15 Aug 2025 14:29:58 -0700 Subject: [PATCH 144/759] Implement wan2.2 camera model. (#9357) Use the old WanCameraImageToVideo node. --- comfy/ldm/wan/model.py | 7 ++++++- comfy/model_detection.py | 5 ++++- comfy/supported_models.py | 14 +++++++++++++- comfy_extras/nodes_wan.py | 7 +++++-- 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 4e2d99566..9d3741be3 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -768,7 +768,12 @@ class CameraWanModel(WanModel): operations=None, ): - super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations) + if model_type == 'camera': + model_type = 'i2v' + else: + model_type = 't2v' + + super().__init__(model_type=model_type, patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations) operation_settings = {"operations": operations, "device": device, "dtype": dtype} self.control_adapter = WanCamAdapter(in_dim_control_adapter, dim, kernel_size=patch_size[1:], stride=patch_size[1:], operation_settings=operation_settings) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 8acc51e20..2bec0541e 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -364,7 +364,10 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["vace_in_dim"] = state_dict['{}vace_patch_embedding.weight'.format(key_prefix)].shape[1] dit_config["vace_layers"] = count_blocks(state_dict_keys, '{}vace_blocks.'.format(key_prefix) + '{}.') elif '{}control_adapter.conv.weight'.format(key_prefix) in state_dict_keys: - dit_config["model_type"] = "camera" + if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: + dit_config["model_type"] = "camera" + else: + dit_config["model_type"] = "camera_2.2" else: if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "i2v" diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 156ff9e26..7ed6dfd69 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1046,6 +1046,18 @@ class WAN21_Camera(WAN21_T2V): def get_model(self, state_dict, prefix="", device=None): out = model_base.WAN21_Camera(self, image_to_video=False, device=device) return out + +class WAN22_Camera(WAN21_T2V): + unet_config = { + "image_model": "wan2.1", + "model_type": "camera_2.2", + "in_dim": 36, + } + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.WAN21_Camera(self, image_to_video=False, device=device) + return out + class WAN21_Vace(WAN21_T2V): unet_config = { "image_model": "wan2.1", @@ -1260,6 +1272,6 @@ class QwenImage(supported_models_base.BASE): return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 694a183f6..83a990688 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -422,9 +422,12 @@ class WanCameraImageToVideo(io.ComfyNode): start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) concat_latent_image = vae.encode(start_image[:, :, :, :3]) concat_latent[:,:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] + mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1])) + mask[:, :, :start_image.shape[0] + 3] = 0.0 + mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2) - positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent}) - negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent}) + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask}) if camera_conditions is not None: positive = node_helpers.conditioning_set_values(positive, {'camera_conditions': camera_conditions}) From ed2e33c69a291094c4fcc13d8426c49844a6363c Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Fri, 15 Aug 2025 20:32:58 -0700 Subject: [PATCH 145/759] bump frontend version to 1.25.8 (#9361) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 551002b5b..2ae44ebe1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.24.4 +comfyui-frontend-package==1.25.8 comfyui-workflow-templates==0.1.59 comfyui-embedded-docs==0.2.6 torch From 20a84166d0d37dd6833caa6cadf3bfac8c241b48 Mon Sep 17 00:00:00 2001 From: Terry Jia Date: Sat, 16 Aug 2025 02:07:12 -0400 Subject: [PATCH 146/759] record audio node (#8716) * record audio node * sf --- comfy_extras/nodes_audio.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index a90b31779..3b23f65d8 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -346,6 +346,24 @@ class LoadAudio: return "Invalid audio file: {}".format(audio) return True +class RecordAudio: + @classmethod + def INPUT_TYPES(s): + return {"required": {"audio": ("AUDIO_RECORD", {})}} + + CATEGORY = "audio" + + RETURN_TYPES = ("AUDIO", ) + FUNCTION = "load" + + def load(self, audio): + audio_path = folder_paths.get_annotated_filepath(audio) + + waveform, sample_rate = torchaudio.load(audio_path) + audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate} + return (audio, ) + + NODE_CLASS_MAPPINGS = { "EmptyLatentAudio": EmptyLatentAudio, "VAEEncodeAudio": VAEEncodeAudio, @@ -356,6 +374,7 @@ NODE_CLASS_MAPPINGS = { "LoadAudio": LoadAudio, "PreviewAudio": PreviewAudio, "ConditioningStableAudio": ConditioningStableAudio, + "RecordAudio": RecordAudio, } NODE_DISPLAY_NAME_MAPPINGS = { @@ -367,4 +386,5 @@ NODE_DISPLAY_NAME_MAPPINGS = { "SaveAudio": "Save Audio (FLAC)", "SaveAudioMP3": "Save Audio (MP3)", "SaveAudioOpus": "Save Audio (Opus)", + "RecordAudio": "Record Audio", } From 0f2b8525bcafe213e8421a49564a90f926e81f2e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 16 Aug 2025 14:51:28 -0700 Subject: [PATCH 147/759] Qwen image model refactor. (#9375) --- comfy/ldm/qwen_image/model.py | 36 +++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 99843f88d..40d8fd979 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -333,21 +333,25 @@ class QwenImageTransformer2DModel(nn.Module): self.proj_out = operations.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True, dtype=dtype, device=device) self.gradient_checkpointing = False - def pos_embeds(self, x, context): + def process_img(self, x, index=0, h_offset=0, w_offset=0): bs, c, t, h, w = x.shape patch_size = self.patch_size + hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (1, self.patch_size, self.patch_size)) + orig_shape = hidden_states.shape + hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) + hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) + hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) h_len = ((h + (patch_size // 2)) // patch_size) w_len = ((w + (patch_size // 2)) // patch_size) - img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) - img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs) + h_offset = ((h_offset + (patch_size // 2)) // patch_size) + w_offset = ((w_offset + (patch_size // 2)) // patch_size) - txt_start = round(max(h_len, w_len)) - txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(bs, 1, 3) - ids = torch.cat((txt_ids, img_ids), dim=1) - return self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + img_ids[:, :, 0] = img_ids[:, :, 1] + index + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + return hidden_states, repeat(img_ids, "h w c -> b (h w) c", b=bs), orig_shape def forward( self, @@ -363,13 +367,13 @@ class QwenImageTransformer2DModel(nn.Module): encoder_hidden_states = context encoder_hidden_states_mask = attention_mask - image_rotary_emb = self.pos_embeds(x, context) + hidden_states, img_ids, orig_shape = self.process_img(x) + num_embeds = hidden_states.shape[1] - hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (1, self.patch_size, self.patch_size)) - orig_shape = hidden_states.shape - hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) - hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) - hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) + txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size), ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size))) + txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) + ids = torch.cat((txt_ids, img_ids), dim=1) + image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) hidden_states = self.img_in(hidden_states) encoder_hidden_states = self.txt_norm(encoder_hidden_states) @@ -408,6 +412,6 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) - hidden_states = hidden_states.view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2) + hidden_states = hidden_states[:, :num_embeds].view(orig_shape[0], orig_shape[-2] // 2, orig_shape[-1] // 2, orig_shape[1], 2, 2) hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5) return hidden_states.reshape(orig_shape)[:, :, :, :x.shape[-2], :x.shape[-1]] From ed43784b0d04e5b8e8ff2c057fa84b9c5132aaf2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 17 Aug 2025 13:45:39 -0700 Subject: [PATCH 148/759] WIP Qwen edit model: The diffusion model part. (#9383) --- comfy/ldm/qwen_image/model.py | 26 ++++++++++++++++++++++++++ comfy/model_base.py | 10 ++++++++++ 2 files changed, 36 insertions(+) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 40d8fd979..a3c726299 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -360,6 +360,7 @@ class QwenImageTransformer2DModel(nn.Module): context, attention_mask=None, guidance: torch.Tensor = None, + ref_latents=None, transformer_options={}, **kwargs ): @@ -370,6 +371,31 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states, img_ids, orig_shape = self.process_img(x) num_embeds = hidden_states.shape[1] + if ref_latents is not None: + h = 0 + w = 0 + index = 0 + index_ref_method = kwargs.get("ref_latents_method", "index") == "index" + for ref in ref_latents: + if index_ref_method: + index += 1 + h_offset = 0 + w_offset = 0 + else: + index = 1 + h_offset = 0 + w_offset = 0 + if ref.shape[-2] + h > ref.shape[-1] + w: + w_offset = w + else: + h_offset = h + h = max(h, ref.shape[-2] + h_offset) + w = max(w, ref.shape[-1] + w_offset) + + kontext, kontext_ids, _ = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset) + hidden_states = torch.cat([hidden_states, kontext], dim=1) + img_ids = torch.cat([img_ids, kontext_ids], dim=1) + txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size), ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size))) txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) diff --git a/comfy/model_base.py b/comfy/model_base.py index bf874b875..15bd7abef 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1331,4 +1331,14 @@ class QwenImage(BaseModel): cross_attn = kwargs.get("cross_attn", None) if cross_attn is not None: out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + ref_latents = kwargs.get("reference_latents", None) + if ref_latents is not None: + latents = [] + for lat in ref_latents: + latents.append(self.process_latent_in(lat)) + out['ref_latents'] = comfy.conds.CONDList(latents) + + ref_latents_method = kwargs.get("reference_latents_method", None) + if ref_latents_method is not None: + out['ref_latents_method'] = comfy.conds.CONDConstant(ref_latents_method) return out From d4e353a94ec5a8cb15ed151990a9518b890e5d4f Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Mon, 18 Aug 2025 05:38:40 +0800 Subject: [PATCH 149/759] Update template to 0.1.60 (#9377) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2ae44ebe1..72a700028 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.8 -comfyui-workflow-templates==0.1.59 +comfyui-workflow-templates==0.1.60 comfyui-embedded-docs==0.2.6 torch torchsde From 7f3b9b16c6636cb1201213574892d33c2a35e4ba Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Sun, 17 Aug 2025 15:54:07 -0700 Subject: [PATCH 150/759] Make step index detection much more robust (#9392) --- comfy/context_windows.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/comfy/context_windows.py b/comfy/context_windows.py index 928b111df..041f380f9 100644 --- a/comfy/context_windows.py +++ b/comfy/context_windows.py @@ -164,8 +164,11 @@ class IndexListContextHandler(ContextHandlerABC): return resized_cond def set_step(self, timestep: torch.Tensor, model_options: dict[str]): - indexes = torch.where(model_options["transformer_options"]["sample_sigmas"] == timestep[0]) - self._step = int(indexes[0]) + mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep, rtol=0.0001) + matches = torch.nonzero(mask) + if torch.numel(matches) == 0: + raise Exception("No sample_sigmas matched current timestep; something went wrong.") + self._step = int(matches[0].item()) def get_context_windows(self, model: BaseModel, x_in: torch.Tensor, model_options: dict[str]) -> list[IndexListContextWindow]: full_length = x_in.size(self.dim) # TODO: choose dim based on model From da2efeaec6609265051165bfb413a2a4c84cf4bb Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Sun, 17 Aug 2025 20:21:02 -0700 Subject: [PATCH 151/759] Bump frontend to 1.25.9 (#9394) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 72a700028..c7a5c47ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.25.8 +comfyui-frontend-package==1.25.9 comfyui-workflow-templates==0.1.60 comfyui-embedded-docs==0.2.6 torch From bd2ab73976a4e245db3e057795328c89bfd98a88 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 18 Aug 2025 10:26:55 +0300 Subject: [PATCH 152/759] fix(WAN-nodes): invalid nodeid for WanTrackToVideo (#9396) --- comfy_extras/nodes_wan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 83a990688..0fff02f76 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -699,7 +699,7 @@ class WanTrackToVideo(io.ComfyNode): @classmethod def define_schema(cls): return io.Schema( - node_id="WanPhantomSubjectToVideo", + node_id="WanTrackToVideo", category="conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), From 4977f203fa8e9e3ab22884c8ace8f9b540d48952 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 18 Aug 2025 19:38:34 -0700 Subject: [PATCH 153/759] P2 of qwen edit model. (#9412) * P2 of qwen edit model. * Typo. * Fix normal qwen. * Fix. * Make the TextEncodeQwenImageEdit also set the ref latent. If you don't want it to set the ref latent and want to use the ReferenceLatent node with your custom latent instead just disconnect the VAE. --- comfy/clip_model.py | 2 +- comfy/model_base.py | 8 + comfy/sd1_clip.py | 11 +- comfy/text_encoders/bert.py | 2 +- comfy/text_encoders/llama.py | 43 ++- comfy/text_encoders/qwen_image.py | 20 +- comfy/text_encoders/qwen_vl.py | 428 ++++++++++++++++++++++++++++++ comfy/text_encoders/t5.py | 2 +- comfy_extras/nodes_qwen.py | 63 +++++ nodes.py | 1 + 10 files changed, 565 insertions(+), 15 deletions(-) create mode 100644 comfy/text_encoders/qwen_vl.py create mode 100644 comfy_extras/nodes_qwen.py diff --git a/comfy/clip_model.py b/comfy/clip_model.py index c8294d483..7e47d8a55 100644 --- a/comfy/clip_model.py +++ b/comfy/clip_model.py @@ -97,7 +97,7 @@ class CLIPTextModel_(torch.nn.Module): self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations) self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device) - def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32): + def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32, embeds_info=[]): if embeds is not None: x = embeds + comfy.ops.cast_to(self.embeddings.position_embedding.weight, dtype=dtype, device=embeds.device) else: diff --git a/comfy/model_base.py b/comfy/model_base.py index 15bd7abef..6c861b15e 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1325,6 +1325,7 @@ class Omnigen2(BaseModel): class QwenImage(BaseModel): def __init__(self, model_config, model_type=ModelType.FLUX, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.qwen_image.model.QwenImageTransformer2DModel) + self.memory_usage_factor_conds = ("ref_latents",) def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) @@ -1342,3 +1343,10 @@ class QwenImage(BaseModel): if ref_latents_method is not None: out['ref_latents_method'] = comfy.conds.CONDConstant(ref_latents_method) return out + + def extra_conds_shapes(self, **kwargs): + out = {} + ref_latents = kwargs.get("reference_latents", None) + if ref_latents is not None: + out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16]) + return out diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index ade340fd1..1e8adbe69 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -204,17 +204,19 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): tokens_embed = self.transformer.get_input_embeddings()(tokens_embed, out_dtype=torch.float32) index = 0 pad_extra = 0 + embeds_info = [] for o in other_embeds: emb = o[1] if torch.is_tensor(emb): emb = {"type": "embedding", "data": emb} + extra = None emb_type = emb.get("type", None) if emb_type == "embedding": emb = emb.get("data", None) else: if hasattr(self.transformer, "preprocess_embed"): - emb = self.transformer.preprocess_embed(emb, device=device) + emb, extra = self.transformer.preprocess_embed(emb, device=device) else: emb = None @@ -229,6 +231,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): tokens_embed = torch.cat([tokens_embed[:, :ind], emb, tokens_embed[:, ind:]], dim=1) attention_mask = attention_mask[:ind] + [1] * emb_shape + attention_mask[ind:] index += emb_shape - 1 + embeds_info.append({"type": emb_type, "index": ind, "size": emb_shape, "extra": extra}) else: index += -1 pad_extra += emb_shape @@ -243,11 +246,11 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): attention_masks.append(attention_mask) num_tokens.append(sum(attention_mask)) - return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens + return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens, embeds_info def forward(self, tokens): device = self.transformer.get_input_embeddings().weight.device - embeds, attention_mask, num_tokens = self.process_tokens(tokens, device) + embeds, attention_mask, num_tokens, embeds_info = self.process_tokens(tokens, device) attention_mask_model = None if self.enable_attention_masks: @@ -258,7 +261,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): else: intermediate_output = self.layer_idx - outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32) + outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32, embeds_info=embeds_info) if self.layer == "last": z = outputs[0].float() diff --git a/comfy/text_encoders/bert.py b/comfy/text_encoders/bert.py index 551b03162..ed4638a9a 100644 --- a/comfy/text_encoders/bert.py +++ b/comfy/text_encoders/bert.py @@ -116,7 +116,7 @@ class BertModel_(torch.nn.Module): self.embeddings = BertEmbeddings(config_dict["vocab_size"], config_dict["max_position_embeddings"], config_dict["type_vocab_size"], config_dict["pad_token_id"], embed_dim, layer_norm_eps, dtype, device, operations) self.encoder = BertEncoder(config_dict["num_hidden_layers"], embed_dim, config_dict["intermediate_size"], config_dict["num_attention_heads"], layer_norm_eps, dtype, device, operations) - def forward(self, input_tokens, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None): + def forward(self, input_tokens, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]): x = self.embeddings(input_tokens, embeds=embeds, dtype=dtype) mask = None if attention_mask is not None: diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 1da6a0c94..9d90d5a61 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -2,12 +2,14 @@ import torch import torch.nn as nn from dataclasses import dataclass from typing import Optional, Any +import math from comfy.ldm.modules.attention import optimized_attention_for_device import comfy.model_management import comfy.ldm.common_dit import comfy.model_management +from . import qwen_vl @dataclass class Llama2Config: @@ -100,12 +102,10 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -def precompute_freqs_cis(head_dim, seq_len, theta, device=None): +def precompute_freqs_cis(head_dim, position_ids, theta, device=None): theta_numerator = torch.arange(0, head_dim, 2, device=device).float() inv_freq = 1.0 / (theta ** (theta_numerator / head_dim)) - position_ids = torch.arange(0, seq_len, device=device).unsqueeze(0) - inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) @@ -277,7 +277,7 @@ class Llama2_(nn.Module): self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype) - def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None): + def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[]): if embeds is not None: x = embeds else: @@ -286,8 +286,11 @@ class Llama2_(nn.Module): if self.normalize_in: x *= self.config.hidden_size ** 0.5 + if position_ids is None: + position_ids = torch.arange(0, x.shape[1], device=x.device).unsqueeze(0) + freqs_cis = precompute_freqs_cis(self.config.head_dim, - x.shape[1], + position_ids, self.config.rope_theta, device=x.device) @@ -372,8 +375,38 @@ class Qwen25_7BVLI(BaseLlama, torch.nn.Module): self.num_layers = config.num_hidden_layers self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) + self.visual = qwen_vl.Qwen2VLVisionTransformer(hidden_size=1280, output_hidden_size=config.hidden_size, device=device, dtype=dtype, ops=operations) self.dtype = dtype + def preprocess_embed(self, embed, device): + if embed["type"] == "image": + image, grid = qwen_vl.process_qwen2vl_images(embed["data"]) + return self.visual(image.to(device, dtype=torch.float32), grid), grid + return None, None + + def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]): + grid = None + for e in embeds_info: + if e.get("type") == "image": + grid = e.get("extra", None) + position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device) + start = e.get("index") + position_ids[:, :start] = torch.arange(0, start, device=embeds.device) + end = e.get("size") + start + len_max = int(grid.max()) // 2 + start_next = len_max + start + position_ids[:, end:] = torch.arange(start_next, start_next + (embeds.shape[1] - end), device=embeds.device) + position_ids[0, start:end] = start + max_d = int(grid[0][1]) // 2 + position_ids[1, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start] + max_d = int(grid[0][2]) // 2 + position_ids[2, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start] + + if grid is None: + position_ids = None + + return super().forward(x, attention_mask=attention_mask, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=final_layer_norm_intermediate, dtype=dtype, position_ids=position_ids) + class Gemma2_2B(BaseLlama, torch.nn.Module): def __init__(self, config_dict, dtype, device, operations): super().__init__() diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py index ce5c98097..f07318d6c 100644 --- a/comfy/text_encoders/qwen_image.py +++ b/comfy/text_encoders/qwen_image.py @@ -15,13 +15,27 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen25_7b", tokenizer=Qwen25_7BVLITokenizer) self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image \\(color, shape, size, texture, objects, background\\), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" - def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None,**kwargs): + def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs): if llama_template is None: - llama_text = self.llama_template.format(text) + if len(images) > 0: + llama_text = self.llama_template_images.format(text) + else: + llama_text = self.llama_template.format(text) else: llama_text = llama_template.format(text) - return super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs) + tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs) + key_name = next(iter(tokens)) + embed_count = 0 + qwen_tokens = tokens[key_name] + for r in qwen_tokens: + for i in range(len(r)): + if r[i][0] == 151655: + if len(images) > embed_count: + r[i] = ({"type": "image", "data": images[embed_count], "original_type": "image"},) + r[i][1:] + embed_count += 1 + return tokens class Qwen25_7BVLIModel(sd1_clip.SDClipModel): diff --git a/comfy/text_encoders/qwen_vl.py b/comfy/text_encoders/qwen_vl.py new file mode 100644 index 000000000..3b18ce730 --- /dev/null +++ b/comfy/text_encoders/qwen_vl.py @@ -0,0 +1,428 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Optional, Tuple +import math +from comfy.ldm.modules.attention import optimized_attention_for_device + + +def process_qwen2vl_images( + images: torch.Tensor, + min_pixels: int = 3136, + max_pixels: int = 12845056, + patch_size: int = 14, + temporal_patch_size: int = 2, + merge_size: int = 2, + image_mean: list = None, + image_std: list = None, +): + if image_mean is None: + image_mean = [0.48145466, 0.4578275, 0.40821073] + if image_std is None: + image_std = [0.26862954, 0.26130258, 0.27577711] + + batch_size, height, width, channels = images.shape + device = images.device + # dtype = images.dtype + + images = images.permute(0, 3, 1, 2) + + grid_thw_list = [] + img = images[0] + + factor = patch_size * merge_size + + h_bar = round(height / factor) * factor + w_bar = round(width / factor) * factor + + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = max(factor, math.floor(height / beta / factor) * factor) + w_bar = max(factor, math.floor(width / beta / factor) * factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = math.ceil(height * beta / factor) * factor + w_bar = math.ceil(width * beta / factor) * factor + + img_resized = F.interpolate( + img.unsqueeze(0), + size=(h_bar, w_bar), + mode='bilinear', + align_corners=False + ).squeeze(0) + + normalized = img_resized.clone() + for c in range(3): + normalized[c] = (img_resized[c] - image_mean[c]) / image_std[c] + + grid_h = h_bar // patch_size + grid_w = w_bar // patch_size + grid_thw = torch.tensor([1, grid_h, grid_w], device=device, dtype=torch.long) + + pixel_values = normalized + grid_thw_list.append(grid_thw) + image_grid_thw = torch.stack(grid_thw_list) + + grid_t = 1 + channel = pixel_values.shape[0] + pixel_values = pixel_values.unsqueeze(0).repeat(2, 1, 1, 1) + + patches = pixel_values.reshape( + grid_t, + temporal_patch_size, + channel, + grid_h // merge_size, + merge_size, + patch_size, + grid_w // merge_size, + merge_size, + patch_size, + ) + + patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8) + flatten_patches = patches.reshape( + grid_t * grid_h * grid_w, + channel * temporal_patch_size * patch_size * patch_size + ) + + return flatten_patches, image_grid_thw + + +class VisionPatchEmbed(nn.Module): + def __init__( + self, + patch_size: int = 14, + temporal_patch_size: int = 2, + in_channels: int = 3, + embed_dim: int = 3584, + device=None, + dtype=None, + ops=None, + ): + super().__init__() + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.in_channels = in_channels + self.embed_dim = embed_dim + + kernel_size = [temporal_patch_size, patch_size, patch_size] + self.proj = ops.Conv3d( + in_channels, + embed_dim, + kernel_size=kernel_size, + stride=kernel_size, + bias=False, + device=device, + dtype=dtype + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = hidden_states.view( + -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size + ) + hidden_states = self.proj(hidden_states) + return hidden_states.view(-1, self.embed_dim) + + +def rotate_half(x): + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb_vision(q, k, cos, sin): + cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float() + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class VisionRotaryEmbedding(nn.Module): + def __init__(self, dim: int, theta: float = 10000.0): + super().__init__() + self.dim = dim + self.theta = theta + + def forward(self, seqlen: int, device) -> torch.Tensor: + inv_freq = 1.0 / (self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float, device=device) / self.dim)) + seq = torch.arange(seqlen, device=inv_freq.device, dtype=inv_freq.dtype) + freqs = torch.outer(seq, inv_freq) + return freqs + + +class PatchMerger(nn.Module): + def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2, device=None, dtype=None, ops=None): + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size ** 2) + self.ln_q = ops.RMSNorm(context_dim, eps=1e-6, device=device, dtype=dtype) + self.mlp = nn.Sequential( + ops.Linear(self.hidden_size, self.hidden_size, device=device, dtype=dtype), + nn.GELU(), + ops.Linear(self.hidden_size, dim, device=device, dtype=dtype), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.ln_q(x).reshape(-1, self.hidden_size) + x = self.mlp(x) + return x + + +class VisionAttention(nn.Module): + def __init__(self, hidden_size: int, num_heads: int, device=None, dtype=None, ops=None): + super().__init__() + self.hidden_size = hidden_size + self.num_heads = num_heads + self.head_dim = hidden_size // num_heads + self.scaling = self.head_dim ** -0.5 + + self.qkv = ops.Linear(hidden_size, hidden_size * 3, bias=True, device=device, dtype=dtype) + self.proj = ops.Linear(hidden_size, hidden_size, bias=True, device=device, dtype=dtype) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + cu_seqlens=None, + optimized_attention=None, + ) -> torch.Tensor: + if hidden_states.dim() == 2: + seq_length, _ = hidden_states.shape + batch_size = 1 + hidden_states = hidden_states.unsqueeze(0) + else: + batch_size, seq_length, _ = hidden_states.shape + + qkv = self.qkv(hidden_states) + qkv = qkv.reshape(batch_size, seq_length, 3, self.num_heads, self.head_dim) + query_states, key_states, value_states = qkv.reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0) + + if position_embeddings is not None: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin) + + query_states = query_states.transpose(0, 1).unsqueeze(0) + key_states = key_states.transpose(0, 1).unsqueeze(0) + value_states = value_states.transpose(0, 1).unsqueeze(0) + + lengths = cu_seqlens[1:] - cu_seqlens[:-1] + splits = [ + torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states) + ] + + attn_outputs = [ + optimized_attention(q, k, v, self.num_heads, skip_reshape=True) + for q, k, v in zip(*splits) + ] + attn_output = torch.cat(attn_outputs, dim=1) + attn_output = attn_output.reshape(seq_length, -1) + attn_output = self.proj(attn_output) + + return attn_output + + +class VisionMLP(nn.Module): + def __init__(self, hidden_size: int, intermediate_size: int, device=None, dtype=None, ops=None): + super().__init__() + self.gate_proj = ops.Linear(hidden_size, intermediate_size, bias=True, device=device, dtype=dtype) + self.up_proj = ops.Linear(hidden_size, intermediate_size, bias=True, device=device, dtype=dtype) + self.down_proj = ops.Linear(intermediate_size, hidden_size, bias=True, device=device, dtype=dtype) + self.act_fn = nn.SiLU() + + def forward(self, hidden_state): + return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state)) + + +class VisionBlock(nn.Module): + def __init__(self, hidden_size: int, intermediate_size: int, num_heads: int, device=None, dtype=None, ops=None): + super().__init__() + self.norm1 = ops.RMSNorm(hidden_size, eps=1e-6, device=device, dtype=dtype) + self.norm2 = ops.RMSNorm(hidden_size, eps=1e-6, device=device, dtype=dtype) + self.attn = VisionAttention(hidden_size, num_heads, device=device, dtype=dtype, ops=ops) + self.mlp = VisionMLP(hidden_size, intermediate_size, device=device, dtype=dtype, ops=ops) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + cu_seqlens=None, + optimized_attention=None, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states = self.attn(hidden_states, position_embeddings, cu_seqlens, optimized_attention) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class Qwen2VLVisionTransformer(nn.Module): + def __init__( + self, + hidden_size: int = 3584, + output_hidden_size: int = 3584, + intermediate_size: int = 3420, + num_heads: int = 16, + num_layers: int = 32, + patch_size: int = 14, + temporal_patch_size: int = 2, + spatial_merge_size: int = 2, + window_size: int = 112, + device=None, + dtype=None, + ops=None + ): + super().__init__() + self.hidden_size = hidden_size + self.patch_size = patch_size + self.spatial_merge_size = spatial_merge_size + self.window_size = window_size + self.fullatt_block_indexes = [7, 15, 23, 31] + + self.patch_embed = VisionPatchEmbed( + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + in_channels=3, + embed_dim=hidden_size, + device=device, + dtype=dtype, + ops=ops, + ) + + head_dim = hidden_size // num_heads + self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([ + VisionBlock(hidden_size, intermediate_size, num_heads, device, dtype, ops) + for _ in range(num_layers) + ]) + + self.merger = PatchMerger( + dim=output_hidden_size, + context_dim=hidden_size, + spatial_merge_size=spatial_merge_size, + device=device, + dtype=dtype, + ops=ops, + ) + + def get_window_index(self, grid_thw): + window_index = [] + cu_window_seqlens = [0] + window_index_id = 0 + vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size + + for grid_t, grid_h, grid_w in grid_thw: + llm_grid_h = grid_h // self.spatial_merge_size + llm_grid_w = grid_w // self.spatial_merge_size + + index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w) + + pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size + pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size + num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size + num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size + + index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100) + index_padded = index_padded.reshape( + grid_t, + num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size, + ) + index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( + grid_t, + num_windows_h * num_windows_w, + vit_merger_window_size, + vit_merger_window_size, + ) + + seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) + index_padded = index_padded.reshape(-1) + index_new = index_padded[index_padded != -100] + window_index.append(index_new + window_index_id) + + cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_size * self.spatial_merge_size + cu_window_seqlens[-1] + cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) + window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() + + window_index = torch.cat(window_index, dim=0) + return window_index, cu_window_seqlens + + def get_position_embeddings(self, grid_thw, device): + pos_ids = [] + + for t, h, w in grid_thw: + hpos_ids = torch.arange(h, device=device).unsqueeze(1).expand(-1, w) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten() + + wpos_ids = torch.arange(w, device=device).unsqueeze(0).expand(h, -1) + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ) + wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten() + + pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size, device) + return rotary_pos_emb_full[pos_ids].flatten(1) + + def forward( + self, + pixel_values: torch.Tensor, + image_grid_thw: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + optimized_attention = optimized_attention_for_device(pixel_values.device, mask=False, small_input=True) + + hidden_states = self.patch_embed(pixel_values) + + window_index, cu_window_seqlens = self.get_window_index(image_grid_thw) + cu_window_seqlens = torch.tensor(cu_window_seqlens, device=hidden_states.device) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + + position_embeddings = self.get_position_embeddings(image_grid_thw, hidden_states.device) + + seq_len, _ = hidden_states.size() + spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size + + hidden_states = hidden_states.reshape(seq_len // spatial_merge_unit, spatial_merge_unit, -1) + hidden_states = hidden_states[window_index, :, :] + hidden_states = hidden_states.reshape(seq_len, -1) + + position_embeddings = position_embeddings.reshape(seq_len // spatial_merge_unit, spatial_merge_unit, -1) + position_embeddings = position_embeddings[window_index, :, :] + position_embeddings = position_embeddings.reshape(seq_len, -1) + position_embeddings = torch.cat((position_embeddings, position_embeddings), dim=-1) + position_embeddings = (position_embeddings.cos(), position_embeddings.sin()) + + cu_seqlens = torch.repeat_interleave(image_grid_thw[:, 1] * image_grid_thw[:, 2], image_grid_thw[:, 0]).cumsum( + dim=0, + dtype=torch.int32, + ) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + for i, block in enumerate(self.blocks): + if i in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + else: + cu_seqlens_now = cu_window_seqlens + hidden_states = block(hidden_states, position_embeddings, cu_seqlens_now, optimized_attention=optimized_attention) + + hidden_states = self.merger(hidden_states) + return hidden_states diff --git a/comfy/text_encoders/t5.py b/comfy/text_encoders/t5.py index 36bf35309..e8588992a 100644 --- a/comfy/text_encoders/t5.py +++ b/comfy/text_encoders/t5.py @@ -199,7 +199,7 @@ class T5Stack(torch.nn.Module): self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device, operations=operations) # self.dropout = nn.Dropout(config.dropout_rate) - def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None): + def forward(self, x, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]): mask = None if attention_mask is not None: mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py new file mode 100644 index 000000000..b5088fae2 --- /dev/null +++ b/comfy_extras/nodes_qwen.py @@ -0,0 +1,63 @@ +import node_helpers +import comfy.utils + +PREFERRED_QWENIMAGE_RESOLUTIONS = [ + (672, 1568), + (688, 1504), + (720, 1456), + (752, 1392), + (800, 1328), + (832, 1248), + (880, 1184), + (944, 1104), + (1024, 1024), + (1104, 944), + (1184, 880), + (1248, 832), + (1328, 800), + (1392, 752), + (1456, 720), + (1504, 688), + (1568, 672), +] + + +class TextEncodeQwenImageEdit: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "clip": ("CLIP", ), + "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), + }, + "optional": {"vae": ("VAE", ), + "image": ("IMAGE", ),}} + + RETURN_TYPES = ("CONDITIONING",) + FUNCTION = "encode" + + CATEGORY = "advanced/conditioning" + + def encode(self, clip, prompt, vae=None, image=None): + ref_latent = None + if image is None: + images = [] + else: + images = [image] + if vae is not None: + width = image.shape[2] + height = image.shape[1] + aspect_ratio = width / height + _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS) + image = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "lanczos", "center").movedim(1, -1) + ref_latent = vae.encode(image[:, :, :, :3]) + + tokens = clip.tokenize(prompt, images=images) + conditioning = clip.encode_from_tokens_scheduled(tokens) + if ref_latent is not None: + conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": [ref_latent]}, append=True) + return (conditioning, ) + + +NODE_CLASS_MAPPINGS = { + "TextEncodeQwenImageEdit": TextEncodeQwenImageEdit, +} diff --git a/nodes.py b/nodes.py index 860a236aa..b3fa9c51a 100644 --- a/nodes.py +++ b/nodes.py @@ -2321,6 +2321,7 @@ async def init_builtin_extra_nodes(): "nodes_edit_model.py", "nodes_tcfg.py", "nodes_context_windows.py", + "nodes_qwen.py", ] import_failed = [] From 36b5127fd3eee8eaf95ff7296a61269ed56d53c0 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:28:07 +0300 Subject: [PATCH 154/759] api_nodes: add kling-v2-1 and v2-1-master (#9257) --- comfy_api_nodes/apis/__init__.py | 3 +++ comfy_api_nodes/nodes_kling.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 54298e8a9..c6f91e9d6 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -1315,6 +1315,7 @@ class KlingTaskStatus(str, Enum): class KlingTextToVideoModelName(str, Enum): kling_v1 = 'kling-v1' kling_v1_6 = 'kling-v1-6' + kling_v2_1_master = 'kling-v2-1-master' class KlingVideoGenAspectRatio(str, Enum): @@ -1347,6 +1348,8 @@ class KlingVideoGenModelName(str, Enum): kling_v1_5 = 'kling-v1-5' kling_v1_6 = 'kling-v1-6' kling_v2_master = 'kling-v2-master' + kling_v2_1 = 'kling-v2-1' + kling_v2_1_master = 'kling-v2-1-master' class KlingVideoResult(BaseModel): diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 9d483bb0e..9fa390985 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -421,6 +421,8 @@ class KlingTextToVideoNode(KlingNodeBase): "pro mode / 10s duration / kling-v2-master": ("pro", "10", "kling-v2-master"), "standard mode / 5s duration / kling-v2-master": ("std", "5", "kling-v2-master"), "standard mode / 10s duration / kling-v2-master": ("std", "10", "kling-v2-master"), + "pro mode / 5s duration / kling-v2-1-master": ("pro", "5", "kling-v2-1-master"), + "pro mode / 10s duration / kling-v2-1-master": ("pro", "10", "kling-v2-1-master"), } @classmethod From f16a70ba670e11de549af188663a87c77c5bc0c2 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:28:27 +0300 Subject: [PATCH 155/759] api_nodes: add MinimaxHailuoVideoNode node (#9262) --- comfy_api_nodes/apis/__init__.py | 13 ++- comfy_api_nodes/nodes_minimax.py | 185 ++++++++++++++++++++++++++++++- 2 files changed, 191 insertions(+), 7 deletions(-) diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index c6f91e9d6..7a09df55b 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -1623,13 +1623,14 @@ class MinimaxTaskResultResponse(BaseModel): task_id: str = Field(..., description='The task ID being queried.') -class Model(str, Enum): +class MiniMaxModel(str, Enum): T2V_01_Director = 'T2V-01-Director' I2V_01_Director = 'I2V-01-Director' S2V_01 = 'S2V-01' I2V_01 = 'I2V-01' I2V_01_live = 'I2V-01-live' T2V_01 = 'T2V-01' + Hailuo_02 = 'MiniMax-Hailuo-02' class SubjectReferenceItem(BaseModel): @@ -1651,7 +1652,7 @@ class MinimaxVideoGenerationRequest(BaseModel): None, description='URL or base64 encoding of the first frame image. Required when model is I2V-01, I2V-01-Director, or I2V-01-live.', ) - model: Model = Field( + model: MiniMaxModel = Field( ..., description='Required. ID of model. Options: T2V-01-Director, I2V-01-Director, S2V-01, I2V-01, I2V-01-live, T2V-01', ) @@ -1668,6 +1669,14 @@ class MinimaxVideoGenerationRequest(BaseModel): None, description='Only available when model is S2V-01. The model will generate a video based on the subject uploaded through this parameter.', ) + duration: Optional[int] = Field( + None, + description="The length of the output video in seconds." + ) + resolution: Optional[str] = Field( + None, + description="The dimensions of the video display. 1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels." + ) class MinimaxVideoGenerationResponse(BaseModel): diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index 58d2ed90c..bb3c9e710 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -1,3 +1,4 @@ +from inspect import cleandoc from typing import Union import logging import torch @@ -10,7 +11,7 @@ from comfy_api_nodes.apis import ( MinimaxFileRetrieveResponse, MinimaxTaskResultResponse, SubjectReferenceItem, - Model + MiniMaxModel ) from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -84,7 +85,6 @@ class MinimaxTextToVideoNode: FUNCTION = "generate_video" CATEGORY = "api node/video/MiniMax" API_NODE = True - OUTPUT_NODE = True async def generate_video( self, @@ -121,7 +121,7 @@ class MinimaxTextToVideoNode: response_model=MinimaxVideoGenerationResponse, ), request=MinimaxVideoGenerationRequest( - model=Model(model), + model=MiniMaxModel(model), prompt=prompt_text, callback_url=None, first_frame_image=image_url, @@ -251,7 +251,6 @@ class MinimaxImageToVideoNode(MinimaxTextToVideoNode): FUNCTION = "generate_video" CATEGORY = "api node/video/MiniMax" API_NODE = True - OUTPUT_NODE = True class MinimaxSubjectToVideoNode(MinimaxTextToVideoNode): @@ -313,7 +312,181 @@ class MinimaxSubjectToVideoNode(MinimaxTextToVideoNode): FUNCTION = "generate_video" CATEGORY = "api node/video/MiniMax" API_NODE = True - OUTPUT_NODE = True + + +class MinimaxHailuoVideoNode: + """Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.""" + + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "prompt_text": ( + "STRING", + { + "multiline": True, + "default": "", + "tooltip": "Text prompt to guide the video generation.", + }, + ), + }, + "optional": { + "seed": ( + IO.INT, + { + "default": 0, + "min": 0, + "max": 0xFFFFFFFFFFFFFFFF, + "control_after_generate": True, + "tooltip": "The random seed used for creating the noise.", + }, + ), + "first_frame_image": ( + IO.IMAGE, + { + "tooltip": "Optional image to use as the first frame to generate a video." + }, + ), + "prompt_optimizer": ( + IO.BOOLEAN, + { + "tooltip": "Optimize prompt to improve generation quality when needed.", + "default": True, + }, + ), + "duration": ( + IO.COMBO, + { + "tooltip": "The length of the output video in seconds.", + "default": 6, + "options": [6, 10], + }, + ), + "resolution": ( + IO.COMBO, + { + "tooltip": "The dimensions of the video display. " + "1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels.", + "default": "768P", + "options": ["768P", "1080P"], + }, + ), + }, + "hidden": { + "auth_token": "AUTH_TOKEN_COMFY_ORG", + "comfy_api_key": "API_KEY_COMFY_ORG", + "unique_id": "UNIQUE_ID", + }, + } + + RETURN_TYPES = ("VIDEO",) + DESCRIPTION = cleandoc(__doc__ or "") + FUNCTION = "generate_video" + CATEGORY = "api node/video/MiniMax" + API_NODE = True + + async def generate_video( + self, + prompt_text, + seed=0, + first_frame_image: torch.Tensor=None, # used for ImageToVideo + prompt_optimizer=True, + duration=6, + resolution="768P", + model="MiniMax-Hailuo-02", + unique_id: Union[str, None]=None, + **kwargs, + ): + if first_frame_image is None: + validate_string(prompt_text, field_name="prompt_text") + + if model == "MiniMax-Hailuo-02" and resolution.upper() == "1080P" and duration != 6: + raise Exception( + "When model is MiniMax-Hailuo-02 and resolution is 1080P, duration is limited to 6 seconds." + ) + + # upload image, if passed in + image_url = None + if first_frame_image is not None: + image_url = (await upload_images_to_comfyapi(first_frame_image, max_images=1, auth_kwargs=kwargs))[0] + + video_generate_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/minimax/video_generation", + method=HttpMethod.POST, + request_model=MinimaxVideoGenerationRequest, + response_model=MinimaxVideoGenerationResponse, + ), + request=MinimaxVideoGenerationRequest( + model=MiniMaxModel(model), + prompt=prompt_text, + callback_url=None, + first_frame_image=image_url, + prompt_optimizer=prompt_optimizer, + duration=duration, + resolution=resolution, + ), + auth_kwargs=kwargs, + ) + response = await video_generate_operation.execute() + + task_id = response.task_id + if not task_id: + raise Exception(f"MiniMax generation failed: {response.base_resp}") + + average_duration = 120 if resolution == "768P" else 240 + video_generate_operation = PollingOperation( + poll_endpoint=ApiEndpoint( + path="/proxy/minimax/query/video_generation", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MinimaxTaskResultResponse, + query_params={"task_id": task_id}, + ), + completed_statuses=["Success"], + failed_statuses=["Fail"], + status_extractor=lambda x: x.status.value, + estimated_duration=average_duration, + node_id=unique_id, + auth_kwargs=kwargs, + ) + task_result = await video_generate_operation.execute() + + file_id = task_result.file_id + if file_id is None: + raise Exception("Request was not successful. Missing file ID.") + file_retrieve_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/minimax/files/retrieve", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MinimaxFileRetrieveResponse, + query_params={"file_id": int(file_id)}, + ), + request=EmptyRequest(), + auth_kwargs=kwargs, + ) + file_result = await file_retrieve_operation.execute() + + file_url = file_result.file.download_url + if file_url is None: + raise Exception( + f"No video was found in the response. Full response: {file_result.model_dump()}" + ) + logging.info(f"Generated video URL: {file_url}") + if unique_id: + if hasattr(file_result.file, "backup_download_url"): + message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}" + else: + message = f"Result URL: {file_url}" + PromptServer.instance.send_progress_text(message, unique_id) + + video_io = await download_url_to_bytesio(file_url) + if video_io is None: + error_msg = f"Failed to download video from {file_url}" + logging.error(error_msg) + raise Exception(error_msg) + return (VideoFromFile(video_io),) # A dictionary that contains all nodes you want to export with their names @@ -322,6 +495,7 @@ NODE_CLASS_MAPPINGS = { "MinimaxTextToVideoNode": MinimaxTextToVideoNode, "MinimaxImageToVideoNode": MinimaxImageToVideoNode, # "MinimaxSubjectToVideoNode": MinimaxSubjectToVideoNode, + "MinimaxHailuoVideoNode": MinimaxHailuoVideoNode, } # A dictionary that contains the friendly/humanly readable titles for the nodes @@ -329,4 +503,5 @@ NODE_DISPLAY_NAME_MAPPINGS = { "MinimaxTextToVideoNode": "MiniMax Text to Video", "MinimaxImageToVideoNode": "MiniMax Image to Video", "MinimaxSubjectToVideoNode": "MiniMax Subject to Video", + "MinimaxHailuoVideoNode": "MiniMax Hailuo Video", } From 07a927517cfaf099fec3903e8973f758e62d65f9 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:29:01 +0300 Subject: [PATCH 156/759] api_nodes: add GPT-5 series models (#9325) --- comfy_api_nodes/nodes_openai.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index cbff2b2d2..674c9ede0 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -80,6 +80,9 @@ class SupportedOpenAIModel(str, Enum): gpt_4_1 = "gpt-4.1" gpt_4_1_mini = "gpt-4.1-mini" gpt_4_1_nano = "gpt-4.1-nano" + gpt_5 = "gpt-5" + gpt_5_mini = "gpt-5-mini" + gpt_5_nano = "gpt-5-nano" class OpenAIDalle2(ComfyNodeABC): From d844d8b13bfd6a83b0a7d0491aa2978ac44a6158 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:29:24 +0300 Subject: [PATCH 157/759] api_nodes: added release version of google's models (#9304) --- comfy_api_nodes/nodes_gemini.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 3751fb2a1..ba4167a50 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -46,6 +46,8 @@ class GeminiModel(str, Enum): gemini_2_5_pro_preview_05_06 = "gemini-2.5-pro-preview-05-06" gemini_2_5_flash_preview_04_17 = "gemini-2.5-flash-preview-04-17" + gemini_2_5_pro = "gemini-2.5-pro" + gemini_2_5_flash = "gemini-2.5-flash" def get_gemini_endpoint( @@ -97,7 +99,7 @@ class GeminiNode(ComfyNodeABC): { "tooltip": "The Gemini model to use for generating responses.", "options": [model.value for model in GeminiModel], - "default": GeminiModel.gemini_2_5_pro_preview_05_06.value, + "default": GeminiModel.gemini_2_5_pro.value, }, ), "seed": ( From 54d8fdbed0a7b171ab8cfb02e29a7e0dc5fe78fd Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:30:06 +0300 Subject: [PATCH 158/759] feat(api-nodes): add Vidu Video nodes (#9368) --- comfy_api_nodes/nodes_vidu.py | 622 +++++++++++++++++++++++ comfy_api_nodes/util/validation_utils.py | 53 ++ nodes.py | 1 + 3 files changed, 676 insertions(+) create mode 100644 comfy_api_nodes/nodes_vidu.py diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py new file mode 100644 index 000000000..2f441948c --- /dev/null +++ b/comfy_api_nodes/nodes_vidu.py @@ -0,0 +1,622 @@ +import logging +from enum import Enum +from typing import Any, Callable, Optional, Literal, TypeVar +from typing_extensions import override + +import torch +from pydantic import BaseModel, Field + +from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api_nodes.util.validation_utils import ( + validate_aspect_ratio_closeness, + validate_image_dimensions, + validate_image_aspect_ratio_range, + get_number_of_images, +) +from comfy_api_nodes.apis.client import ( + ApiEndpoint, + HttpMethod, + SynchronousOperation, + PollingOperation, + EmptyRequest, +) +from comfy_api_nodes.apinode_utils import download_url_to_video_output, upload_images_to_comfyapi + + +VIDU_TEXT_TO_VIDEO = "/proxy/vidu/text2video" +VIDU_IMAGE_TO_VIDEO = "/proxy/vidu/img2video" +VIDU_REFERENCE_VIDEO = "/proxy/vidu/reference2video" +VIDU_START_END_VIDEO = "/proxy/vidu/start-end2video" +VIDU_GET_GENERATION_STATUS = "/proxy/vidu/tasks/%s/creations" + +R = TypeVar("R") + +class VideoModelName(str, Enum): + vidu_q1 = 'viduq1' + + +class AspectRatio(str, Enum): + r_16_9 = "16:9" + r_9_16 = "9:16" + r_1_1 = "1:1" + + +class Resolution(str, Enum): + r_1080p = "1080p" + + +class MovementAmplitude(str, Enum): + auto = "auto" + small = "small" + medium = "medium" + large = "large" + + +class TaskCreationRequest(BaseModel): + model: VideoModelName = VideoModelName.vidu_q1 + prompt: Optional[str] = Field(None, max_length=1500) + duration: Optional[Literal[5]] = 5 + seed: Optional[int] = Field(0, ge=0, le=2147483647) + aspect_ratio: Optional[AspectRatio] = AspectRatio.r_16_9 + resolution: Optional[Resolution] = Resolution.r_1080p + movement_amplitude: Optional[MovementAmplitude] = MovementAmplitude.auto + images: Optional[list[str]] = Field(None, description="Base64 encoded string or image URL") + + +class TaskStatus(str, Enum): + created = "created" + queueing = "queueing" + processing = "processing" + success = "success" + failed = "failed" + + +class TaskCreationResponse(BaseModel): + task_id: str = Field(...) + state: TaskStatus = Field(...) + created_at: str = Field(...) + code: Optional[int] = Field(None, description="Error code") + + +class TaskResult(BaseModel): + id: str = Field(..., description="Creation id") + url: str = Field(..., description="The URL of the generated results, valid for one hour") + cover_url: str = Field(..., description="The cover URL of the generated results, valid for one hour") + + +class TaskStatusResponse(BaseModel): + state: TaskStatus = Field(...) + err_code: Optional[str] = Field(None) + creations: list[TaskResult] = Field(..., description="Generated results") + + +async def poll_until_finished( + auth_kwargs: dict[str, str], + api_endpoint: ApiEndpoint[Any, R], + result_url_extractor: Optional[Callable[[R], str]] = None, + estimated_duration: Optional[int] = None, + node_id: Optional[str] = None, +) -> R: + return await PollingOperation( + poll_endpoint=api_endpoint, + completed_statuses=[TaskStatus.success.value], + failed_statuses=[TaskStatus.failed.value], + status_extractor=lambda response: response.state.value, + auth_kwargs=auth_kwargs, + result_url_extractor=result_url_extractor, + estimated_duration=estimated_duration, + node_id=node_id, + poll_interval=16.0, + max_poll_attempts=256, + ).execute() + + +def get_video_url_from_response(response) -> Optional[str]: + if response.creations: + return response.creations[0].url + return None + + +def get_video_from_response(response) -> TaskResult: + if not response.creations: + error_msg = f"Vidu request does not contain results. State: {response.state}, Error Code: {response.err_code}" + logging.info(error_msg) + raise RuntimeError(error_msg) + logging.info("Vidu task %s succeeded. Video URL: %s", response.creations[0].id, response.creations[0].url) + return response.creations[0] + + +async def execute_task( + vidu_endpoint: str, + auth_kwargs: Optional[dict[str, str]], + payload: TaskCreationRequest, + estimated_duration: int, + node_id: str, +) -> R: + response = await SynchronousOperation( + endpoint=ApiEndpoint( + path=vidu_endpoint, + method=HttpMethod.POST, + request_model=TaskCreationRequest, + response_model=TaskCreationResponse, + ), + request=payload, + auth_kwargs=auth_kwargs, + ).execute() + if response.state == TaskStatus.failed: + error_msg = f"Vidu request failed. Code: {response.code}" + logging.error(error_msg) + raise RuntimeError(error_msg) + return await poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=VIDU_GET_GENERATION_STATUS % response.task_id, + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=TaskStatusResponse, + ), + result_url_extractor=get_video_url_from_response, + estimated_duration=estimated_duration, + node_id=node_id, + ) + + +class ViduTextToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduTextToVideoNode", + display_name="Vidu Text To Video Generation", + category="api node/video/Vidu", + description="Generate video from text prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation", + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[model.value for model in AspectRatio], + default=AspectRatio.r_16_9.value, + tooltip="The aspect ratio of the output video", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + duration: int, + seed: int, + aspect_ratio: str, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + if not prompt: + raise ValueError("The prompt field is required and cannot be empty.") + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + aspect_ratio=aspect_ratio, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + results = await execute_task(VIDU_TEXT_TO_VIDEO, auth, payload, 320, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduImageToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduImageToVideoNode", + display_name="Vidu Image To Video Generation", + category="api node/video/Vidu", + description="Generate video from image and optional prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "image", + tooltip="An image to be used as the start frame of the generated video", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="A textual description for video generation", + optional=True, + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + image: torch.Tensor, + prompt: str, + duration: int, + seed: int, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + if get_number_of_images(image) > 1: + raise ValueError("Only one input image is allowed.") + validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload.images = await upload_images_to_comfyapi( + image, + max_images=1, + mime_type="image/png", + auth_kwargs=auth, + ) + results = await execute_task(VIDU_IMAGE_TO_VIDEO, auth, payload, 120, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduReferenceVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduReferenceVideoNode", + display_name="Vidu Reference To Video Generation", + category="api node/video/Vidu", + description="Generate video from multiple images and prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "images", + tooltip="Images to use as references to generate a video with consistent subjects (max 7 images).", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation", + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[model.value for model in AspectRatio], + default=AspectRatio.r_16_9.value, + tooltip="The aspect ratio of the output video", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + images: torch.Tensor, + prompt: str, + duration: int, + seed: int, + aspect_ratio: str, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + if not prompt: + raise ValueError("The prompt field is required and cannot be empty.") + a = get_number_of_images(images) + if a > 7: + raise ValueError("Too many images, maximum allowed is 7.") + for image in images: + validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) + validate_image_dimensions(image, min_width=128, min_height=128) + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + aspect_ratio=aspect_ratio, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload.images = await upload_images_to_comfyapi( + images, + max_images=7, + mime_type="image/png", + auth_kwargs=auth, + ) + results = await execute_task(VIDU_REFERENCE_VIDEO, auth, payload, 120, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduStartEndToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ViduStartEndToVideoNode", + display_name="Vidu Start End To Video Generation", + category="api node/video/Vidu", + description="Generate a video from start and end frames and a prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in VideoModelName], + default=VideoModelName.vidu_q1.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "first_frame", + tooltip="Start frame", + ), + comfy_io.Image.Input( + "end_frame", + tooltip="End frame", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="A textual description for video generation", + optional=True, + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=5, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[model.value for model in Resolution], + default=Resolution.r_1080p.value, + tooltip="Supported values may vary by model & duration", + optional=True, + ), + comfy_io.Combo.Input( + "movement_amplitude", + options=[model.value for model in MovementAmplitude], + default=MovementAmplitude.auto.value, + tooltip="The movement amplitude of objects in the frame", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + first_frame: torch.Tensor, + end_frame: torch.Tensor, + prompt: str, + duration: int, + seed: int, + resolution: str, + movement_amplitude: str, + ) -> comfy_io.NodeOutput: + validate_aspect_ratio_closeness(first_frame, end_frame, min_rel=0.8, max_rel=1.25, strict=False) + payload = TaskCreationRequest( + model_name=model, + prompt=prompt, + duration=duration, + seed=seed, + resolution=resolution, + movement_amplitude=movement_amplitude, + ) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload.images = [ + (await upload_images_to_comfyapi(frame, max_images=1, mime_type="image/png", auth_kwargs=auth))[0] + for frame in (first_frame, end_frame) + ] + results = await execute_task(VIDU_START_END_VIDEO, auth, payload, 96, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + + +class ViduExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + ViduTextToVideoNode, + ViduImageToVideoNode, + ViduReferenceVideoNode, + ViduStartEndToVideoNode, + ] + +async def comfy_entrypoint() -> ViduExtension: + return ViduExtension() diff --git a/comfy_api_nodes/util/validation_utils.py b/comfy_api_nodes/util/validation_utils.py index 031b9fbd3..606b794bf 100644 --- a/comfy_api_nodes/util/validation_utils.py +++ b/comfy_api_nodes/util/validation_utils.py @@ -53,6 +53,53 @@ def validate_image_aspect_ratio( ) +def validate_image_aspect_ratio_range( + image: torch.Tensor, + min_ratio: tuple[float, float], # e.g. (1, 4) + max_ratio: tuple[float, float], # e.g. (4, 1) + *, + strict: bool = True, # True -> (min, max); False -> [min, max] +) -> float: + a1, b1 = min_ratio + a2, b2 = max_ratio + if a1 <= 0 or b1 <= 0 or a2 <= 0 or b2 <= 0: + raise ValueError("Ratios must be positive, like (1, 4) or (4, 1).") + lo, hi = (a1 / b1), (a2 / b2) + if lo > hi: + lo, hi = hi, lo + a1, b1, a2, b2 = a2, b2, a1, b1 # swap only for error text + w, h = get_image_dimensions(image) + if w <= 0 or h <= 0: + raise ValueError(f"Invalid image dimensions: {w}x{h}") + ar = w / h + ok = (lo < ar < hi) if strict else (lo <= ar <= hi) + if not ok: + op = "<" if strict else "≤" + raise ValueError(f"Image aspect ratio {ar:.6g} is outside allowed range: {a1}:{b1} {op} ratio {op} {a2}:{b2}") + return ar + + +def validate_aspect_ratio_closeness( + start_img, + end_img, + min_rel: float, + max_rel: float, + *, + strict: bool = False, # True => exclusive, False => inclusive +) -> None: + w1, h1 = get_image_dimensions(start_img) + w2, h2 = get_image_dimensions(end_img) + if min(w1, h1, w2, h2) <= 0: + raise ValueError("Invalid image dimensions") + ar1 = w1 / h1 + ar2 = w2 / h2 + # Normalize so it is symmetric (no need to check both ar1/ar2 and ar2/ar1) + closeness = max(ar1, ar2) / min(ar1, ar2) + limit = max(max_rel, 1.0 / min_rel) # for 0.8..1.25 this is 1.25 + if (closeness >= limit) if strict else (closeness > limit): + raise ValueError(f"Aspect ratios must be close: start/end={ar1/ar2:.4f}, allowed range {min_rel}–{max_rel}.") + + def validate_video_dimensions( video: VideoInput, min_width: Optional[int] = None, @@ -98,3 +145,9 @@ def validate_video_duration( raise ValueError( f"Video duration must be at most {max_duration}s, got {duration}s" ) + + +def get_number_of_images(images): + if isinstance(images, torch.Tensor): + return images.shape[0] if images.ndim >= 4 else 1 + return len(images) diff --git a/nodes.py b/nodes.py index b3fa9c51a..35dda1b19 100644 --- a/nodes.py +++ b/nodes.py @@ -2351,6 +2351,7 @@ async def init_builtin_api_nodes(): "nodes_moonvalley.py", "nodes_rodin.py", "nodes_gemini.py", + "nodes_vidu.py", ] if not await load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"): From bddd69618bf4463209c3681babfcbebd9b9aed85 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 13:49:01 -0700 Subject: [PATCH 159/759] Change the TextEncodeQwenImageEdit node to use logic closer to reference. (#9432) --- comfy_extras/nodes_qwen.py | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py index b5088fae2..fff89556f 100644 --- a/comfy_extras/nodes_qwen.py +++ b/comfy_extras/nodes_qwen.py @@ -1,25 +1,6 @@ import node_helpers import comfy.utils - -PREFERRED_QWENIMAGE_RESOLUTIONS = [ - (672, 1568), - (688, 1504), - (720, 1456), - (752, 1392), - (800, 1328), - (832, 1248), - (880, 1184), - (944, 1104), - (1024, 1024), - (1104, 944), - (1184, 880), - (1248, 832), - (1328, 800), - (1392, 752), - (1456, 720), - (1504, 688), - (1568, 672), -] +import math class TextEncodeQwenImageEdit: @@ -42,13 +23,17 @@ class TextEncodeQwenImageEdit: if image is None: images = [] else: - images = [image] + samples = image.movedim(-1, 1) + total = int(1024 * 1024) + + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + width = round(samples.shape[3] * scale_by) + height = round(samples.shape[2] * scale_by) + + s = comfy.utils.common_upscale(samples, width, height, "area", "disabled") + image = s.movedim(1, -1) + images = [image[:, :, :, :3]] if vae is not None: - width = image.shape[2] - height = image.shape[1] - aspect_ratio = width / height - _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS) - image = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "lanczos", "center").movedim(1, -1) ref_latent = vae.encode(image[:, :, :, :3]) tokens = clip.tokenize(prompt, images=images) From dfa791eb4bfcaac3eb9b2b33fa15ae5a25589bb8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 17:47:42 -0700 Subject: [PATCH 160/759] Rope fix for qwen vl. (#9435) --- comfy/text_encoders/llama.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 9d90d5a61..4c976058f 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -27,6 +27,7 @@ class Llama2Config: rms_norm_add = False mlp_activation = "silu" qkv_bias = False + rope_dims = None @dataclass class Qwen25_3BConfig: @@ -44,6 +45,7 @@ class Qwen25_3BConfig: rms_norm_add = False mlp_activation = "silu" qkv_bias = True + rope_dims = None @dataclass class Qwen25_7BVLI_Config: @@ -61,6 +63,7 @@ class Qwen25_7BVLI_Config: rms_norm_add = False mlp_activation = "silu" qkv_bias = True + rope_dims = [16, 24, 24] @dataclass class Gemma2_2B_Config: @@ -78,6 +81,7 @@ class Gemma2_2B_Config: rms_norm_add = True mlp_activation = "gelu_pytorch_tanh" qkv_bias = False + rope_dims = None class RMSNorm(nn.Module): def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None): @@ -102,7 +106,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -def precompute_freqs_cis(head_dim, position_ids, theta, device=None): +def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=None): theta_numerator = torch.arange(0, head_dim, 2, device=device).float() inv_freq = 1.0 / (theta ** (theta_numerator / head_dim)) @@ -112,12 +116,20 @@ def precompute_freqs_cis(head_dim, position_ids, theta, device=None): emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() sin = emb.sin() + if rope_dims is not None and position_ids.shape[0] > 1: + mrope_section = rope_dims * 2 + cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) + sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) + else: + cos = cos.unsqueeze(1) + sin = sin.unsqueeze(1) + return (cos, sin) def apply_rope(xq, xk, freqs_cis): - cos = freqs_cis[0].unsqueeze(1) - sin = freqs_cis[1].unsqueeze(1) + cos = freqs_cis[0] + sin = freqs_cis[1] q_embed = (xq * cos) + (rotate_half(xq) * sin) k_embed = (xk * cos) + (rotate_half(xk) * sin) return q_embed, k_embed @@ -292,6 +304,7 @@ class Llama2_(nn.Module): freqs_cis = precompute_freqs_cis(self.config.head_dim, position_ids, self.config.rope_theta, + self.config.rope_dims, device=x.device) mask = None From 7cd2c4bd6ab20f35a6bb1b1f2252c3ea16da4777 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 21:45:27 -0700 Subject: [PATCH 161/759] Qwen rotary embeddings should now match reference code. (#9437) --- comfy/ldm/qwen_image/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index a3c726299..bf3940313 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -349,8 +349,8 @@ class QwenImageTransformer2DModel(nn.Module): img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) img_ids[:, :, 0] = img_ids[:, :, 1] + index - img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - (h_len // 2) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - (w_len // 2) return hidden_states, repeat(img_ids, "h w c -> b (h w) c", b=bs), orig_shape def forward( @@ -396,7 +396,7 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states = torch.cat([hidden_states, kontext], dim=1) img_ids = torch.cat([img_ids, kontext_ids], dim=1) - txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size), ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size))) + txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) From 5a8f502db5889873ffa13132b603b7b6daac605a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 22:08:11 -0700 Subject: [PATCH 162/759] Disable prompt weights for qwen. (#9438) --- comfy/sd1_clip.py | 5 ++++- comfy/text_encoders/qwen_image.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 1e8adbe69..f8a7c2a1b 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -534,7 +534,10 @@ class SDTokenizer: min_padding = tokenizer_options.get("{}_min_padding".format(self.embedding_key), self.min_padding) text = escape_important(text) - parsed_weights = token_weights(text, 1.0) + if kwargs.get("disable_weights", False): + parsed_weights = [(text, 1.0)] + else: + parsed_weights = token_weights(text, 1.0) # tokenize words tokens = [] diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py index f07318d6c..6646b1003 100644 --- a/comfy/text_encoders/qwen_image.py +++ b/comfy/text_encoders/qwen_image.py @@ -15,7 +15,7 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen25_7b", tokenizer=Qwen25_7BVLITokenizer) self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" - self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image \\(color, shape, size, texture, objects, background\\), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" + self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs): if llama_template is None: @@ -25,7 +25,7 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer): llama_text = self.llama_template.format(text) else: llama_text = llama_template.format(text) - tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, **kwargs) + tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs) key_name = next(iter(tokens)) embed_count = 0 qwen_tokens = tokens[key_name] From 8d38ea3bbf7e77ed7e7aee401b157dab211c5307 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 19 Aug 2025 23:58:54 -0700 Subject: [PATCH 163/759] Fix bf16 precision issue with qwen image embeddings. (#9441) --- comfy/ldm/qwen_image/model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index bf3940313..49f66b90a 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -347,7 +347,7 @@ class QwenImageTransformer2DModel(nn.Module): h_offset = ((h_offset + (patch_size // 2)) // patch_size) w_offset = ((w_offset + (patch_size // 2)) // patch_size) - img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + img_ids = torch.zeros((h_len, w_len, 3), device=x.device) img_ids[:, :, 0] = img_ids[:, :, 1] + index img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - (h_len // 2) img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - (w_len // 2) @@ -397,9 +397,10 @@ class QwenImageTransformer2DModel(nn.Module): img_ids = torch.cat([img_ids, kontext_ids], dim=1) txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) - txt_ids = torch.linspace(txt_start, txt_start + context.shape[1], steps=context.shape[1], device=x.device, dtype=x.dtype).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) + txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + del ids, txt_ids, img_ids hidden_states = self.img_in(hidden_states) encoder_hidden_states = self.txt_norm(encoder_hidden_states) From 2f52e8f05f2039dd67e9b9783b8397350a548b95 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 20 Aug 2025 15:15:09 +0800 Subject: [PATCH 164/759] Bump template to 0.1.62 (#9419) * Bump template to 0.1.61 * Bump template to 0.1.62 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c7a5c47ab..8d928d826 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.9 -comfyui-workflow-templates==0.1.60 +comfyui-workflow-templates==0.1.62 comfyui-embedded-docs==0.2.6 torch torchsde From 7139d6d93fc7b5481a69b687080bd36f7b531c46 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 20 Aug 2025 03:15:30 -0400 Subject: [PATCH 165/759] ComfyUI version 0.3.51 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 29ec07ca6..65f06cf37 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.50" +__version__ = "0.3.51" diff --git a/pyproject.toml b/pyproject.toml index 659b5730a..ecbf04303 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.50" +version = "0.3.51" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From fe01885acf892de636b1b2743903812099bd42e3 Mon Sep 17 00:00:00 2001 From: Harel Cain Date: Wed, 20 Aug 2025 09:33:10 +0200 Subject: [PATCH 166/759] LTXV: fix key frame noise mask dimensions for when real noise mask exists (#9425) --- comfy_extras/nodes_lt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index b5058667a..f82337a67 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -166,7 +166,7 @@ class LTXVAddGuide: negative = self.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors) mask = torch.full( - (noise_mask.shape[0], 1, guiding_latent.shape[2], 1, 1), + (noise_mask.shape[0], 1, guiding_latent.shape[2], noise_mask.shape[3], noise_mask.shape[4]), 1.0 - strength, dtype=noise_mask.dtype, device=noise_mask.device, From e73a9dbe30434280c69d852ea78cc4bf88bfd501 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 20 Aug 2025 14:34:13 -0700 Subject: [PATCH 167/759] Add that qwen edit model is supported to readme. (#9463) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index fa99a8cbe..79a8a8c79 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model) - [HiDream E1.1](https://comfyanonymous.github.io/ComfyUI_examples/hidream/#hidream-e11) + - [Qwen Image Edit](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/#edit-model) - Video Models - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/) - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/) From 0963493a9c3b6565f8537288a0fb90991391ec41 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 20 Aug 2025 19:26:37 -0700 Subject: [PATCH 168/759] Support for Qwen Diffsynth Controlnets canny and depth. (#9465) These are not real controlnets but actually a patch on the model so they will be treated as such. Put them in the models/model_patches/ folder. Use the new ModelPatchLoader and QwenImageDiffsynthControlnet nodes. --- comfy/ldm/qwen_image/model.py | 7 + comfy/model_management.py | 8 +- comfy/model_patcher.py | 27 ++++ comfy_api/latest/_io.py | 4 + comfy_extras/nodes_model_patch.py | 138 ++++++++++++++++++++ models/model_patches/put_model_patches_here | 0 nodes.py | 1 + 7 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 comfy_extras/nodes_model_patch.py create mode 100644 models/model_patches/put_model_patches_here diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 49f66b90a..2503583cb 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -416,6 +416,7 @@ class QwenImageTransformer2DModel(nn.Module): ) patches_replace = transformer_options.get("patches_replace", {}) + patches = transformer_options.get("patches", {}) blocks_replace = patches_replace.get("dit", {}) for i, block in enumerate(self.transformer_blocks): @@ -436,6 +437,12 @@ class QwenImageTransformer2DModel(nn.Module): image_rotary_emb=image_rotary_emb, ) + if "double_block" in patches: + for p in patches["double_block"]: + out = p({"img": hidden_states, "txt": encoder_hidden_states, "x": x, "block_index": i}) + hidden_states = out["img"] + encoder_hidden_states = out["txt"] + hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) diff --git a/comfy/model_management.py b/comfy/model_management.py index 2a9f18068..d08aee1fe 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -593,7 +593,13 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu else: minimum_memory_required = max(inference_memory, minimum_memory_required + extra_reserved_memory()) - models = set(models) + models_temp = set() + for m in models: + models_temp.add(m) + for mm in m.model_patches_models(): + models_temp.add(mm) + + models = models_temp models_to_load = [] diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 52e76b5f3..a944cb421 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -430,6 +430,9 @@ class ModelPatcher: def set_model_forward_timestep_embed_patch(self, patch): self.set_model_patch(patch, "forward_timestep_embed_patch") + def set_model_double_block_patch(self, patch): + self.set_model_patch(patch, "double_block") + def add_object_patch(self, name, obj): self.object_patches[name] = obj @@ -486,6 +489,30 @@ class ModelPatcher: if hasattr(wrap_func, "to"): self.model_options["model_function_wrapper"] = wrap_func.to(device) + def model_patches_models(self): + to = self.model_options["transformer_options"] + models = [] + if "patches" in to: + patches = to["patches"] + for name in patches: + patch_list = patches[name] + for i in range(len(patch_list)): + if hasattr(patch_list[i], "models"): + models += patch_list[i].models() + if "patches_replace" in to: + patches = to["patches_replace"] + for name in patches: + patch_list = patches[name] + for k in patch_list: + if hasattr(patch_list[k], "models"): + models += patch_list[k].models() + if "model_function_wrapper" in self.model_options: + wrap_func = self.model_options["model_function_wrapper"] + if hasattr(wrap_func, "models"): + models += wrap_func.models() + + return models + def model_dtype(self): if hasattr(self.model, "get_dtype"): return self.model.get_dtype() diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index ec1efb51d..a3a21facc 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -726,6 +726,10 @@ class SEGS(ComfyTypeIO): class AnyType(ComfyTypeIO): Type = Any +@comfytype(io_type="MODEL_PATCH") +class MODEL_PATCH(ComfyTypeIO): + Type = Any + @comfytype(io_type="COMFY_MULTITYPED_V3") class MultiType: Type = Any diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py new file mode 100644 index 000000000..bb239bc45 --- /dev/null +++ b/comfy_extras/nodes_model_patch.py @@ -0,0 +1,138 @@ +import torch +import folder_paths +import comfy.utils +import comfy.ops +import comfy.model_management +import comfy.ldm.common_dit +import comfy.latent_formats + + +class BlockWiseControlBlock(torch.nn.Module): + # [linear, gelu, linear] + def __init__(self, dim: int = 3072, device=None, dtype=None, operations=None): + super().__init__() + self.x_rms = operations.RMSNorm(dim, eps=1e-6) + self.y_rms = operations.RMSNorm(dim, eps=1e-6) + self.input_proj = operations.Linear(dim, dim) + self.act = torch.nn.GELU() + self.output_proj = operations.Linear(dim, dim) + + def forward(self, x, y): + x, y = self.x_rms(x), self.y_rms(y) + x = self.input_proj(x + y) + x = self.act(x) + x = self.output_proj(x) + return x + + +class QwenImageBlockWiseControlNet(torch.nn.Module): + def __init__( + self, + num_layers: int = 60, + in_dim: int = 64, + additional_in_dim: int = 0, + dim: int = 3072, + device=None, dtype=None, operations=None + ): + super().__init__() + self.img_in = operations.Linear(in_dim + additional_in_dim, dim, device=device, dtype=dtype) + self.controlnet_blocks = torch.nn.ModuleList( + [ + BlockWiseControlBlock(dim, device=device, dtype=dtype, operations=operations) + for _ in range(num_layers) + ] + ) + + def process_input_latent_image(self, latent_image): + latent_image = comfy.latent_formats.Wan21().process_in(latent_image) + patch_size = 2 + hidden_states = comfy.ldm.common_dit.pad_to_patch_size(latent_image, (1, patch_size, patch_size)) + orig_shape = hidden_states.shape + hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2) + hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5) + hidden_states = hidden_states.reshape(orig_shape[0], (orig_shape[-2] // 2) * (orig_shape[-1] // 2), orig_shape[1] * 4) + return self.img_in(hidden_states) + + def control_block(self, img, controlnet_conditioning, block_id): + return self.controlnet_blocks[block_id](img, controlnet_conditioning) + + +class ModelPatchLoader: + @classmethod + def INPUT_TYPES(s): + return {"required": { "name": (folder_paths.get_filename_list("model_patches"), ), + }} + RETURN_TYPES = ("MODEL_PATCH",) + FUNCTION = "load_model_patch" + EXPERIMENTAL = True + + CATEGORY = "advanced/loaders" + + def load_model_patch(self, name): + model_patch_path = folder_paths.get_full_path_or_raise("model_patches", name) + sd = comfy.utils.load_torch_file(model_patch_path, safe_load=True) + dtype = comfy.utils.weight_dtype(sd) + # TODO: this node will work with more types of model patches + model = QwenImageBlockWiseControlNet(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + model.load_state_dict(sd) + model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) + return (model,) + + +class DiffSynthCnetPatch: + def __init__(self, model_patch, vae, image, strength): + self.encoded_image = model_patch.model.process_input_latent_image(vae.encode(image)) + self.model_patch = model_patch + self.vae = vae + self.image = image + self.strength = strength + + def __call__(self, kwargs): + x = kwargs.get("x") + img = kwargs.get("img") + block_index = kwargs.get("block_index") + if self.encoded_image is None or self.encoded_image.shape[1:] != img.shape[1:]: + spacial_compression = self.vae.spacial_compression_encode() + image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") + loaded_models = comfy.model_management.loaded_models(only_currently_used=True) + self.encoded_image = self.model_patch.model.process_input_latent_image(self.vae.encode(image_scaled.movedim(1, -1))) + comfy.model_management.load_models_gpu(loaded_models) + + img = img + (self.model_patch.model.control_block(img, self.encoded_image.to(img.dtype), block_index) * self.strength) + kwargs['img'] = img + return kwargs + + def to(self, device_or_dtype): + if isinstance(device_or_dtype, torch.device): + self.encoded_image = self.encoded_image.to(device_or_dtype) + return self + + def models(self): + return [self.model_patch] + +class QwenImageDiffsynthControlnet: + @classmethod + def INPUT_TYPES(s): + return {"required": { "model": ("MODEL",), + "model_patch": ("MODEL_PATCH",), + "vae": ("VAE",), + "image": ("IMAGE",), + "strength": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01}), + }} + RETURN_TYPES = ("MODEL",) + FUNCTION = "diffsynth_controlnet" + EXPERIMENTAL = True + + CATEGORY = "advanced/loaders/qwen" + + def diffsynth_controlnet(self, model, model_patch, vae, image, strength): + model_patched = model.clone() + image = image[:, :, :, :3] + model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength)) + return (model_patched,) + + +NODE_CLASS_MAPPINGS = { + "ModelPatchLoader": ModelPatchLoader, + "QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet, +} diff --git a/models/model_patches/put_model_patches_here b/models/model_patches/put_model_patches_here new file mode 100644 index 000000000..e69de29bb diff --git a/nodes.py b/nodes.py index 35dda1b19..9681750d3 100644 --- a/nodes.py +++ b/nodes.py @@ -2322,6 +2322,7 @@ async def init_builtin_extra_nodes(): "nodes_tcfg.py", "nodes_context_windows.py", "nodes_qwen.py", + "nodes_model_patch.py" ] import_failed = [] From 0737b7e0d245de20192064da4888debbef3241c2 Mon Sep 17 00:00:00 2001 From: saurabh-pingale Date: Thu, 21 Aug 2025 07:57:57 +0530 Subject: [PATCH 169/759] fix(userdata): catch invalid workflow filenames (#9434) (#9445) --- app/user_manager.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/app/user_manager.py b/app/user_manager.py index 0ec3e46ea..a2d376c0c 100644 --- a/app/user_manager.py +++ b/app/user_manager.py @@ -363,10 +363,17 @@ class UserManager(): if not overwrite and os.path.exists(path): return web.Response(status=409, text="File already exists") - body = await request.read() + try: + body = await request.read() - with open(path, "wb") as f: - f.write(body) + with open(path, "wb") as f: + f.write(body) + except OSError as e: + logging.warning(f"Error saving file '{path}': {e}") + return web.Response( + status=400, + reason="Invalid filename. Please avoid special characters like :\\/*?\"<>|" + ) user_path = self.get_request_user_filepath(request, None) if full_info: From 9fa1036f60b5264302072453be524aa55928bbaf Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 20 Aug 2025 20:09:35 -0700 Subject: [PATCH 170/759] Forgot this. (#9470) --- folder_paths.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/folder_paths.py b/folder_paths.py index 9ec952940..b34af39e8 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -46,6 +46,8 @@ folder_names_and_paths["photomaker"] = ([os.path.join(models_dir, "photomaker")] folder_names_and_paths["classifiers"] = ([os.path.join(models_dir, "classifiers")], {""}) +folder_names_and_paths["model_patches"] = ([os.path.join(models_dir, "model_patches")], supported_pt_extensions) + output_directory = os.path.join(base_path, "output") temp_directory = os.path.join(base_path, "temp") input_directory = os.path.join(base_path, "input") From 1b2de2642d38099acdde7c460d133d93e91074f0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 20 Aug 2025 21:33:49 -0700 Subject: [PATCH 171/759] Support diffsynth inpaint controlnet (model patch). (#9471) --- comfy_extras/nodes_model_patch.py | 39 ++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index bb239bc45..3eaada9bc 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -35,6 +35,7 @@ class QwenImageBlockWiseControlNet(torch.nn.Module): device=None, dtype=None, operations=None ): super().__init__() + self.additional_in_dim = additional_in_dim self.img_in = operations.Linear(in_dim + additional_in_dim, dim, device=device, dtype=dtype) self.controlnet_blocks = torch.nn.ModuleList( [ @@ -44,7 +45,7 @@ class QwenImageBlockWiseControlNet(torch.nn.Module): ) def process_input_latent_image(self, latent_image): - latent_image = comfy.latent_formats.Wan21().process_in(latent_image) + latent_image[:, :16] = comfy.latent_formats.Wan21().process_in(latent_image[:, :16]) patch_size = 2 hidden_states = comfy.ldm.common_dit.pad_to_patch_size(latent_image, (1, patch_size, patch_size)) orig_shape = hidden_states.shape @@ -73,19 +74,33 @@ class ModelPatchLoader: sd = comfy.utils.load_torch_file(model_patch_path, safe_load=True) dtype = comfy.utils.weight_dtype(sd) # TODO: this node will work with more types of model patches - model = QwenImageBlockWiseControlNet(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + additional_in_dim = sd["img_in.weight"].shape[1] - 64 + model = QwenImageBlockWiseControlNet(additional_in_dim=additional_in_dim, device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) model.load_state_dict(sd) model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) return (model,) class DiffSynthCnetPatch: - def __init__(self, model_patch, vae, image, strength): - self.encoded_image = model_patch.model.process_input_latent_image(vae.encode(image)) + def __init__(self, model_patch, vae, image, strength, mask=None): self.model_patch = model_patch self.vae = vae self.image = image self.strength = strength + self.mask = mask + self.encoded_image = model_patch.model.process_input_latent_image(self.encode_latent_cond(image)) + + def encode_latent_cond(self, image): + latent_image = self.vae.encode(image) + if self.model_patch.model.additional_in_dim > 0: + if self.mask is None: + mask_ = torch.ones_like(latent_image)[:, :self.model_patch.model.additional_in_dim // 4] + else: + mask_ = comfy.utils.common_upscale(self.mask.mean(dim=1, keepdim=True), latent_image.shape[-1], latent_image.shape[-2], "bilinear", "none") + + return torch.cat([latent_image, mask_], dim=1) + else: + return latent_image def __call__(self, kwargs): x = kwargs.get("x") @@ -95,7 +110,7 @@ class DiffSynthCnetPatch: spacial_compression = self.vae.spacial_compression_encode() image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") loaded_models = comfy.model_management.loaded_models(only_currently_used=True) - self.encoded_image = self.model_patch.model.process_input_latent_image(self.vae.encode(image_scaled.movedim(1, -1))) + self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1))) comfy.model_management.load_models_gpu(loaded_models) img = img + (self.model_patch.model.control_block(img, self.encoded_image.to(img.dtype), block_index) * self.strength) @@ -118,17 +133,25 @@ class QwenImageDiffsynthControlnet: "vae": ("VAE",), "image": ("IMAGE",), "strength": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01}), - }} + }, + "optional": {"mask": ("MASK",)}} RETURN_TYPES = ("MODEL",) FUNCTION = "diffsynth_controlnet" EXPERIMENTAL = True CATEGORY = "advanced/loaders/qwen" - def diffsynth_controlnet(self, model, model_patch, vae, image, strength): + def diffsynth_controlnet(self, model, model_patch, vae, image, strength, mask=None): model_patched = model.clone() image = image[:, :, :, :3] - model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength)) + if mask is not None: + if mask.ndim == 3: + mask = mask.unsqueeze(1) + if mask.ndim == 4: + mask = mask.unsqueeze(2) + mask = 1.0 - mask + + model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength, mask)) return (model_patched,) From bc49106837b627eb657fc86f2e475770ac5ce68a Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 22 Aug 2025 05:03:57 +0300 Subject: [PATCH 172/759] convert String nodes to V3 schema (#9370) --- comfy_extras/nodes_string.py | 449 ++++++++++++++++++----------------- 1 file changed, 237 insertions(+), 212 deletions(-) diff --git a/comfy_extras/nodes_string.py b/comfy_extras/nodes_string.py index b1a8ceef0..571d89f62 100644 --- a/comfy_extras/nodes_string.py +++ b/comfy_extras/nodes_string.py @@ -1,77 +1,91 @@ import re +from typing_extensions import override -from comfy.comfy_types.node_typing import IO +from comfy_api.latest import ComfyExtension, io -class StringConcatenate(): + +class StringConcatenate(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string_a": (IO.STRING, {"multiline": True}), - "string_b": (IO.STRING, {"multiline": True}), - "delimiter": (IO.STRING, {"multiline": False, "default": ""}) - } - } + def define_schema(cls): + return io.Schema( + node_id="StringConcatenate", + display_name="Concatenate", + category="utils/string", + inputs=[ + io.String.Input("string_a", multiline=True), + io.String.Input("string_b", multiline=True), + io.String.Input("delimiter", multiline=False, default=""), + ], + outputs=[ + io.String.Output(), + ] + ) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string_a, string_b, delimiter, **kwargs): - return delimiter.join((string_a, string_b)), - -class StringSubstring(): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "start": (IO.INT, {}), - "end": (IO.INT, {}), - } - } + def execute(cls, string_a, string_b, delimiter): + return io.NodeOutput(delimiter.join((string_a, string_b))) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - def execute(self, string, start, end, **kwargs): - return string[start:end], - -class StringLength(): +class StringSubstring(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}) - } - } + def define_schema(cls): + return io.Schema( + node_id="StringSubstring", + display_name="Substring", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.Int.Input("start"), + io.Int.Input("end"), + ], + outputs=[ + io.String.Output(), + ] + ) - RETURN_TYPES = (IO.INT,) - RETURN_NAMES = ("length",) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string, **kwargs): - length = len(string) - - return length, - -class CaseConverter(): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "mode": (IO.COMBO, {"options": ["UPPERCASE", "lowercase", "Capitalize", "Title Case"]}) - } - } + def execute(cls, string, start, end): + return io.NodeOutput(string[start:end]) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - def execute(self, string, mode, **kwargs): +class StringLength(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="StringLength", + display_name="Length", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + ], + outputs=[ + io.Int.Output(display_name="length"), + ] + ) + + @classmethod + def execute(cls, string): + return io.NodeOutput(len(string)) + + +class CaseConverter(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="CaseConverter", + display_name="Case Converter", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.Combo.Input("mode", options=["UPPERCASE", "lowercase", "Capitalize", "Title Case"]), + ], + outputs=[ + io.String.Output(), + ] + ) + + @classmethod + def execute(cls, string, mode): if mode == "UPPERCASE": result = string.upper() elif mode == "lowercase": @@ -83,24 +97,27 @@ class CaseConverter(): else: result = string - return result, + return io.NodeOutput(result) -class StringTrim(): +class StringTrim(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "mode": (IO.COMBO, {"options": ["Both", "Left", "Right"]}) - } - } + def define_schema(cls): + return io.Schema( + node_id="StringTrim", + display_name="Trim", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.Combo.Input("mode", options=["Both", "Left", "Right"]), + ], + outputs=[ + io.String.Output(), + ] + ) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string, mode, **kwargs): + @classmethod + def execute(cls, string, mode): if mode == "Both": result = string.strip() elif mode == "Left": @@ -110,70 +127,78 @@ class StringTrim(): else: result = string - return result, + return io.NodeOutput(result) -class StringReplace(): + +class StringReplace(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "find": (IO.STRING, {"multiline": True}), - "replace": (IO.STRING, {"multiline": True}) - } - } + def define_schema(cls): + return io.Schema( + node_id="StringReplace", + display_name="Replace", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.String.Input("find", multiline=True), + io.String.Input("replace", multiline=True), + ], + outputs=[ + io.String.Output(), + ] + ) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string, find, replace, **kwargs): - result = string.replace(find, replace) - return result, - - -class StringContains(): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "substring": (IO.STRING, {"multiline": True}), - "case_sensitive": (IO.BOOLEAN, {"default": True}) - } - } + def execute(cls, string, find, replace): + return io.NodeOutput(string.replace(find, replace)) - RETURN_TYPES = (IO.BOOLEAN,) - RETURN_NAMES = ("contains",) - FUNCTION = "execute" - CATEGORY = "utils/string" - def execute(self, string, substring, case_sensitive, **kwargs): +class StringContains(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="StringContains", + display_name="Contains", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.String.Input("substring", multiline=True), + io.Boolean.Input("case_sensitive", default=True), + ], + outputs=[ + io.Boolean.Output(display_name="contains"), + ] + ) + + @classmethod + def execute(cls, string, substring, case_sensitive): if case_sensitive: contains = substring in string else: contains = substring.lower() in string.lower() - return contains, + return io.NodeOutput(contains) -class StringCompare(): +class StringCompare(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string_a": (IO.STRING, {"multiline": True}), - "string_b": (IO.STRING, {"multiline": True}), - "mode": (IO.COMBO, {"options": ["Starts With", "Ends With", "Equal"]}), - "case_sensitive": (IO.BOOLEAN, {"default": True}) - } - } + def define_schema(cls): + return io.Schema( + node_id="StringCompare", + display_name="Compare", + category="utils/string", + inputs=[ + io.String.Input("string_a", multiline=True), + io.String.Input("string_b", multiline=True), + io.Combo.Input("mode", options=["Starts With", "Ends With", "Equal"]), + io.Boolean.Input("case_sensitive", default=True), + ], + outputs=[ + io.Boolean.Output(), + ] + ) - RETURN_TYPES = (IO.BOOLEAN,) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string_a, string_b, mode, case_sensitive, **kwargs): + @classmethod + def execute(cls, string_a, string_b, mode, case_sensitive): if case_sensitive: a = string_a b = string_b @@ -182,31 +207,34 @@ class StringCompare(): b = string_b.lower() if mode == "Equal": - return a == b, + return io.NodeOutput(a == b) elif mode == "Starts With": - return a.startswith(b), + return io.NodeOutput(a.startswith(b)) elif mode == "Ends With": - return a.endswith(b), + return io.NodeOutput(a.endswith(b)) -class RegexMatch(): + +class RegexMatch(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "regex_pattern": (IO.STRING, {"multiline": True}), - "case_insensitive": (IO.BOOLEAN, {"default": True}), - "multiline": (IO.BOOLEAN, {"default": False}), - "dotall": (IO.BOOLEAN, {"default": False}) - } - } + def define_schema(cls): + return io.Schema( + node_id="RegexMatch", + display_name="Regex Match", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.String.Input("regex_pattern", multiline=True), + io.Boolean.Input("case_insensitive", default=True), + io.Boolean.Input("multiline", default=False), + io.Boolean.Input("dotall", default=False), + ], + outputs=[ + io.Boolean.Output(display_name="matches"), + ] + ) - RETURN_TYPES = (IO.BOOLEAN,) - RETURN_NAMES = ("matches",) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string, regex_pattern, case_insensitive, multiline, dotall, **kwargs): + @classmethod + def execute(cls, string, regex_pattern, case_insensitive, multiline, dotall): flags = 0 if case_insensitive: @@ -223,29 +251,32 @@ class RegexMatch(): except re.error: result = False - return result, + return io.NodeOutput(result) -class RegexExtract(): +class RegexExtract(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "regex_pattern": (IO.STRING, {"multiline": True}), - "mode": (IO.COMBO, {"options": ["First Match", "All Matches", "First Group", "All Groups"]}), - "case_insensitive": (IO.BOOLEAN, {"default": True}), - "multiline": (IO.BOOLEAN, {"default": False}), - "dotall": (IO.BOOLEAN, {"default": False}), - "group_index": (IO.INT, {"default": 1, "min": 0, "max": 100}) - } - } + def define_schema(cls): + return io.Schema( + node_id="RegexExtract", + display_name="Regex Extract", + category="utils/string", + inputs=[ + io.String.Input("string", multiline=True), + io.String.Input("regex_pattern", multiline=True), + io.Combo.Input("mode", options=["First Match", "All Matches", "First Group", "All Groups"]), + io.Boolean.Input("case_insensitive", default=True), + io.Boolean.Input("multiline", default=False), + io.Boolean.Input("dotall", default=False), + io.Int.Input("group_index", default=1, min=0, max=100), + ], + outputs=[ + io.String.Output(), + ] + ) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string, regex_pattern, mode, case_insensitive, multiline, dotall, group_index, **kwargs): + @classmethod + def execute(cls, string, regex_pattern, mode, case_insensitive, multiline, dotall, group_index): join_delimiter = "\n" flags = 0 @@ -294,32 +325,33 @@ class RegexExtract(): except re.error: result = "" - return result, + return io.NodeOutput(result) -class RegexReplace(): - DESCRIPTION = "Find and replace text using regex patterns." +class RegexReplace(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "string": (IO.STRING, {"multiline": True}), - "regex_pattern": (IO.STRING, {"multiline": True}), - "replace": (IO.STRING, {"multiline": True}), - }, - "optional": { - "case_insensitive": (IO.BOOLEAN, {"default": True}), - "multiline": (IO.BOOLEAN, {"default": False}), - "dotall": (IO.BOOLEAN, {"default": False, "tooltip": "When enabled, the dot (.) character will match any character including newline characters. When disabled, dots won't match newlines."}), - "count": (IO.INT, {"default": 0, "min": 0, "max": 100, "tooltip": "Maximum number of replacements to make. Set to 0 to replace all occurrences (default). Set to 1 to replace only the first match, 2 for the first two matches, etc."}), - } - } + def define_schema(cls): + return io.Schema( + node_id="RegexReplace", + display_name="Regex Replace", + category="utils/string", + description="Find and replace text using regex patterns.", + inputs=[ + io.String.Input("string", multiline=True), + io.String.Input("regex_pattern", multiline=True), + io.String.Input("replace", multiline=True), + io.Boolean.Input("case_insensitive", default=True, optional=True), + io.Boolean.Input("multiline", default=False, optional=True), + io.Boolean.Input("dotall", default=False, optional=True, tooltip="When enabled, the dot (.) character will match any character including newline characters. When disabled, dots won't match newlines."), + io.Int.Input("count", default=0, min=0, max=100, optional=True, tooltip="Maximum number of replacements to make. Set to 0 to replace all occurrences (default). Set to 1 to replace only the first match, 2 for the first two matches, etc."), + ], + outputs=[ + io.String.Output(), + ] + ) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/string" - - def execute(self, string, regex_pattern, replace, case_insensitive=True, multiline=False, dotall=False, count=0, **kwargs): + @classmethod + def execute(cls, string, regex_pattern, replace, case_insensitive=True, multiline=False, dotall=False, count=0): flags = 0 if case_insensitive: @@ -329,32 +361,25 @@ class RegexReplace(): if dotall: flags |= re.DOTALL result = re.sub(regex_pattern, replace, string, count=count, flags=flags) - return result, + return io.NodeOutput(result) -NODE_CLASS_MAPPINGS = { - "StringConcatenate": StringConcatenate, - "StringSubstring": StringSubstring, - "StringLength": StringLength, - "CaseConverter": CaseConverter, - "StringTrim": StringTrim, - "StringReplace": StringReplace, - "StringContains": StringContains, - "StringCompare": StringCompare, - "RegexMatch": RegexMatch, - "RegexExtract": RegexExtract, - "RegexReplace": RegexReplace, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "StringConcatenate": "Concatenate", - "StringSubstring": "Substring", - "StringLength": "Length", - "CaseConverter": "Case Converter", - "StringTrim": "Trim", - "StringReplace": "Replace", - "StringContains": "Contains", - "StringCompare": "Compare", - "RegexMatch": "Regex Match", - "RegexExtract": "Regex Extract", - "RegexReplace": "Regex Replace", -} +class StringExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + StringConcatenate, + StringSubstring, + StringLength, + CaseConverter, + StringTrim, + StringReplace, + StringContains, + StringCompare, + RegexMatch, + RegexExtract, + RegexReplace, + ] + +async def comfy_entrypoint() -> StringExtension: + return StringExtension() From bab08f40d10c8737c3424e35bbff873bcb2333bd Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 22 Aug 2025 05:05:36 +0300 Subject: [PATCH 173/759] v3 nodes (part a) (#9149) --- comfy_extras/nodes_ace.py | 80 +++++++----- comfy_extras/nodes_advanced_samplers.py | 88 +++++++------ comfy_extras/nodes_apg.py | 72 +++++++---- comfy_extras/nodes_attention_multiply.py | 154 ++++++++++++++--------- 4 files changed, 239 insertions(+), 155 deletions(-) diff --git a/comfy_extras/nodes_ace.py b/comfy_extras/nodes_ace.py index cbfec15a2..1409233c9 100644 --- a/comfy_extras/nodes_ace.py +++ b/comfy_extras/nodes_ace.py @@ -1,49 +1,63 @@ import torch +from typing_extensions import override + import comfy.model_management import node_helpers +from comfy_api.latest import ComfyExtension, io -class TextEncodeAceStepAudio: + +class TextEncodeAceStepAudio(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "tags": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "lyrics": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "lyrics_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="TextEncodeAceStepAudio", + category="conditioning", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("tags", multiline=True, dynamic_prompts=True), + io.String.Input("lyrics", multiline=True, dynamic_prompts=True), + io.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[io.Conditioning.Output()], + ) - CATEGORY = "conditioning" - - def encode(self, clip, tags, lyrics, lyrics_strength): + @classmethod + def execute(cls, clip, tags, lyrics, lyrics_strength) -> io.NodeOutput: tokens = clip.tokenize(tags, lyrics=lyrics) conditioning = clip.encode_from_tokens_scheduled(tokens) conditioning = node_helpers.conditioning_set_values(conditioning, {"lyrics_strength": lyrics_strength}) - return (conditioning, ) + return io.NodeOutput(conditioning) -class EmptyAceStepLatentAudio: - def __init__(self): - self.device = comfy.model_management.intermediate_device() +class EmptyAceStepLatentAudio(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="EmptyAceStepLatentAudio", + category="latent/audio", + inputs=[ + io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1), + io.Int.Input( + "batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch." + ), + ], + outputs=[io.Latent.Output()], + ) @classmethod - def INPUT_TYPES(s): - return {"required": {"seconds": ("FLOAT", {"default": 120.0, "min": 1.0, "max": 1000.0, "step": 0.1}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent images in the batch."}), - }} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" - - CATEGORY = "latent/audio" - - def generate(self, seconds, batch_size): + def execute(cls, seconds, batch_size) -> io.NodeOutput: length = int(seconds * 44100 / 512 / 8) - latent = torch.zeros([batch_size, 8, 16, length], device=self.device) - return ({"samples": latent, "type": "audio"}, ) + latent = torch.zeros([batch_size, 8, 16, length], device=comfy.model_management.intermediate_device()) + return io.NodeOutput({"samples": latent, "type": "audio"}) -NODE_CLASS_MAPPINGS = { - "TextEncodeAceStepAudio": TextEncodeAceStepAudio, - "EmptyAceStepLatentAudio": EmptyAceStepLatentAudio, -} +class AceExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TextEncodeAceStepAudio, + EmptyAceStepLatentAudio, + ] + +async def comfy_entrypoint() -> AceExtension: + return AceExtension() diff --git a/comfy_extras/nodes_advanced_samplers.py b/comfy_extras/nodes_advanced_samplers.py index 5fbb096fb..5532ffe6a 100644 --- a/comfy_extras/nodes_advanced_samplers.py +++ b/comfy_extras/nodes_advanced_samplers.py @@ -1,8 +1,13 @@ +import numpy as np +import torch +from tqdm.auto import trange +from typing_extensions import override + +import comfy.model_patcher import comfy.samplers import comfy.utils -import torch -import numpy as np -from tqdm.auto import trange +from comfy.k_diffusion.sampling import to_d +from comfy_api.latest import ComfyExtension, io @torch.no_grad() @@ -33,30 +38,29 @@ def sample_lcm_upscale(model, x, sigmas, extra_args=None, callback=None, disable return x -class SamplerLCMUpscale: - upscale_methods = ["bislerp", "nearest-exact", "bilinear", "area", "bicubic"] +class SamplerLCMUpscale(io.ComfyNode): + UPSCALE_METHODS = ["bislerp", "nearest-exact", "bilinear", "area", "bicubic"] @classmethod - def INPUT_TYPES(s): - return {"required": - {"scale_ratio": ("FLOAT", {"default": 1.0, "min": 0.1, "max": 20.0, "step": 0.01}), - "scale_steps": ("INT", {"default": -1, "min": -1, "max": 1000, "step": 1}), - "upscale_method": (s.upscale_methods,), - } - } - RETURN_TYPES = ("SAMPLER",) - CATEGORY = "sampling/custom_sampling/samplers" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="SamplerLCMUpscale", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Float.Input("scale_ratio", default=1.0, min=0.1, max=20.0, step=0.01), + io.Int.Input("scale_steps", default=-1, min=-1, max=1000, step=1), + io.Combo.Input("upscale_method", options=cls.UPSCALE_METHODS), + ], + outputs=[io.Sampler.Output()], + ) - FUNCTION = "get_sampler" - - def get_sampler(self, scale_ratio, scale_steps, upscale_method): + @classmethod + def execute(cls, scale_ratio, scale_steps, upscale_method) -> io.NodeOutput: if scale_steps < 0: scale_steps = None sampler = comfy.samplers.KSAMPLER(sample_lcm_upscale, extra_options={"total_upscale": scale_ratio, "upscale_steps": scale_steps, "upscale_method": upscale_method}) - return (sampler, ) + return io.NodeOutput(sampler) -from comfy.k_diffusion.sampling import to_d -import comfy.model_patcher @torch.no_grad() def sample_euler_pp(model, x, sigmas, extra_args=None, callback=None, disable=None): @@ -82,30 +86,36 @@ def sample_euler_pp(model, x, sigmas, extra_args=None, callback=None, disable=No return x -class SamplerEulerCFGpp: +class SamplerEulerCFGpp(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"version": (["regular", "alternative"],),} - } - RETURN_TYPES = ("SAMPLER",) - # CATEGORY = "sampling/custom_sampling/samplers" - CATEGORY = "_for_testing" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="SamplerEulerCFGpp", + display_name="SamplerEulerCFG++", + category="_for_testing", # "sampling/custom_sampling/samplers" + inputs=[ + io.Combo.Input("version", options=["regular", "alternative"]), + ], + outputs=[io.Sampler.Output()], + is_experimental=True, + ) - FUNCTION = "get_sampler" - - def get_sampler(self, version): + @classmethod + def execute(cls, version) -> io.NodeOutput: if version == "alternative": sampler = comfy.samplers.KSAMPLER(sample_euler_pp) else: sampler = comfy.samplers.ksampler("euler_cfg_pp") - return (sampler, ) + return io.NodeOutput(sampler) -NODE_CLASS_MAPPINGS = { - "SamplerLCMUpscale": SamplerLCMUpscale, - "SamplerEulerCFGpp": SamplerEulerCFGpp, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "SamplerEulerCFGpp": "SamplerEulerCFG++", -} +class AdvancedSamplersExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + SamplerLCMUpscale, + SamplerEulerCFGpp, + ] + +async def comfy_entrypoint() -> AdvancedSamplersExtension: + return AdvancedSamplersExtension() diff --git a/comfy_extras/nodes_apg.py b/comfy_extras/nodes_apg.py index 25b21b1b8..f27ae7da8 100644 --- a/comfy_extras/nodes_apg.py +++ b/comfy_extras/nodes_apg.py @@ -1,4 +1,8 @@ import torch +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io + def project(v0, v1): v1 = torch.nn.functional.normalize(v1, dim=[-1, -2, -3]) @@ -6,22 +10,45 @@ def project(v0, v1): v0_orthogonal = v0 - v0_parallel return v0_parallel, v0_orthogonal -class APG: +class APG(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": ("MODEL",), - "eta": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01, "tooltip": "Controls the scale of the parallel guidance vector. Default CFG behavior at a setting of 1."}), - "norm_threshold": ("FLOAT", {"default": 5.0, "min": 0.0, "max": 50.0, "step": 0.1, "tooltip": "Normalize guidance vector to this value, normalization disable at a setting of 0."}), - "momentum": ("FLOAT", {"default": 0.0, "min": -5.0, "max": 1.0, "step": 0.01, "tooltip":"Controls a running average of guidance during diffusion, disabled at a setting of 0."}), - } - } - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" - CATEGORY = "sampling/custom_sampling" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="APG", + display_name="Adaptive Projected Guidance", + category="sampling/custom_sampling", + inputs=[ + io.Model.Input("model"), + io.Float.Input( + "eta", + default=1.0, + min=-10.0, + max=10.0, + step=0.01, + tooltip="Controls the scale of the parallel guidance vector. Default CFG behavior at a setting of 1.", + ), + io.Float.Input( + "norm_threshold", + default=5.0, + min=0.0, + max=50.0, + step=0.1, + tooltip="Normalize guidance vector to this value, normalization disable at a setting of 0.", + ), + io.Float.Input( + "momentum", + default=0.0, + min=-5.0, + max=1.0, + step=0.01, + tooltip="Controls a running average of guidance during diffusion, disabled at a setting of 0.", + ), + ], + outputs=[io.Model.Output()], + ) - def patch(self, model, eta, norm_threshold, momentum): + @classmethod + def execute(cls, model, eta, norm_threshold, momentum) -> io.NodeOutput: running_avg = 0 prev_sigma = None @@ -65,12 +92,15 @@ class APG: m = model.clone() m.set_model_sampler_pre_cfg_function(pre_cfg_function) - return (m,) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "APG": APG, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "APG": "Adaptive Projected Guidance", -} +class ApgExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + APG, + ] + +async def comfy_entrypoint() -> ApgExtension: + return ApgExtension() diff --git a/comfy_extras/nodes_attention_multiply.py b/comfy_extras/nodes_attention_multiply.py index 4747eb395..c0e494c2a 100644 --- a/comfy_extras/nodes_attention_multiply.py +++ b/comfy_extras/nodes_attention_multiply.py @@ -1,3 +1,7 @@ +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io + def attention_multiply(attn, model, q, k, v, out): m = model.clone() @@ -16,57 +20,71 @@ def attention_multiply(attn, model, q, k, v, out): return m -class UNetSelfAttentionMultiply: +class UNetSelfAttentionMultiply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "q": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "k": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "v": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "out": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="UNetSelfAttentionMultiply", + category="_for_testing/attention_experiments", + inputs=[ + io.Model.Input("model"), + io.Float.Input("q", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("k", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("v", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("out", default=1.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[io.Model.Output()], + is_experimental=True, + ) - CATEGORY = "_for_testing/attention_experiments" - - def patch(self, model, q, k, v, out): + @classmethod + def execute(cls, model, q, k, v, out) -> io.NodeOutput: m = attention_multiply("attn1", model, q, k, v, out) - return (m, ) + return io.NodeOutput(m) -class UNetCrossAttentionMultiply: + +class UNetCrossAttentionMultiply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "q": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "k": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "v": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "out": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="UNetCrossAttentionMultiply", + category="_for_testing/attention_experiments", + inputs=[ + io.Model.Input("model"), + io.Float.Input("q", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("k", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("v", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("out", default=1.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[io.Model.Output()], + is_experimental=True, + ) - CATEGORY = "_for_testing/attention_experiments" - - def patch(self, model, q, k, v, out): + @classmethod + def execute(cls, model, q, k, v, out) -> io.NodeOutput: m = attention_multiply("attn2", model, q, k, v, out) - return (m, ) + return io.NodeOutput(m) -class CLIPAttentionMultiply: + +class CLIPAttentionMultiply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "clip": ("CLIP",), - "q": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "k": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "v": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "out": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("CLIP",) - FUNCTION = "patch" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="CLIPAttentionMultiply", + category="_for_testing/attention_experiments", + inputs=[ + io.Clip.Input("clip"), + io.Float.Input("q", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("k", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("v", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("out", default=1.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[io.Clip.Output()], + is_experimental=True, + ) - CATEGORY = "_for_testing/attention_experiments" - - def patch(self, clip, q, k, v, out): + @classmethod + def execute(cls, clip, q, k, v, out) -> io.NodeOutput: m = clip.clone() sd = m.patcher.model_state_dict() @@ -79,23 +97,28 @@ class CLIPAttentionMultiply: m.add_patches({key: (None,)}, 0.0, v) if key.endswith("self_attn.out_proj.weight") or key.endswith("self_attn.out_proj.bias"): m.add_patches({key: (None,)}, 0.0, out) - return (m, ) + return io.NodeOutput(m) -class UNetTemporalAttentionMultiply: + +class UNetTemporalAttentionMultiply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "self_structural": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "self_temporal": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "cross_structural": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "cross_temporal": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="UNetTemporalAttentionMultiply", + category="_for_testing/attention_experiments", + inputs=[ + io.Model.Input("model"), + io.Float.Input("self_structural", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("self_temporal", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("cross_structural", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("cross_temporal", default=1.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[io.Model.Output()], + is_experimental=True, + ) - CATEGORY = "_for_testing/attention_experiments" - - def patch(self, model, self_structural, self_temporal, cross_structural, cross_temporal): + @classmethod + def execute(cls, model, self_structural, self_temporal, cross_structural, cross_temporal) -> io.NodeOutput: m = model.clone() sd = model.model_state_dict() @@ -110,11 +133,18 @@ class UNetTemporalAttentionMultiply: m.add_patches({k: (None,)}, 0.0, cross_temporal) else: m.add_patches({k: (None,)}, 0.0, cross_structural) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "UNetSelfAttentionMultiply": UNetSelfAttentionMultiply, - "UNetCrossAttentionMultiply": UNetCrossAttentionMultiply, - "CLIPAttentionMultiply": CLIPAttentionMultiply, - "UNetTemporalAttentionMultiply": UNetTemporalAttentionMultiply, -} + +class AttentionMultiplyExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + UNetSelfAttentionMultiply, + UNetCrossAttentionMultiply, + CLIPAttentionMultiply, + UNetTemporalAttentionMultiply, + ] + +async def comfy_entrypoint() -> AttentionMultiplyExtension: + return AttentionMultiplyExtension() From eb39019daae96128ee848d0b7837ede299518a7c Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 22 Aug 2025 05:06:13 +0300 Subject: [PATCH 174/759] [V3] convert Google Veo API node to the V3 schema (#9272) * convert Google Veo API node to the V3 schema * use own full io.Schema for Veo3VideoGenerationNode * fixed typo * use auth_kwargs instead of auth_token/comfy_api_key --- comfy_api_nodes/nodes_veo2.py | 332 +++++++++++++++++++--------------- 1 file changed, 190 insertions(+), 142 deletions(-) diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index e25dab2f5..251aecd42 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -1,17 +1,18 @@ -import io import logging import base64 import aiohttp import torch +from io import BytesIO from typing import Optional +from typing_extensions import override -from comfy.comfy_types.node_typing import IO, ComfyNodeABC +from comfy_api.latest import ComfyExtension, io as comfy_io from comfy_api.input_impl.video_types import VideoFromFile from comfy_api_nodes.apis import ( VeoGenVidRequest, VeoGenVidResponse, VeoGenVidPollRequest, - VeoGenVidPollResponse + VeoGenVidPollResponse, ) from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -22,7 +23,7 @@ from comfy_api_nodes.apis.client import ( from comfy_api_nodes.apinode_utils import ( downscale_image_tensor, - tensor_to_base64_string + tensor_to_base64_string, ) AVERAGE_DURATION_VIDEO_GEN = 32 @@ -50,7 +51,7 @@ def get_video_url_from_response(poll_response: VeoGenVidPollResponse) -> Optiona return None -class VeoVideoGenerationNode(ComfyNodeABC): +class VeoVideoGenerationNode(comfy_io.ComfyNode): """ Generates videos from text prompts using Google's Veo API. @@ -59,101 +60,93 @@ class VeoVideoGenerationNode(ComfyNodeABC): """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text description of the video", - }, + def define_schema(cls): + return comfy_io.Schema( + node_id="VeoVideoGenerationNode", + display_name="Google Veo 2 Video Generation", + category="api node/video/Veo", + description="Generates videos from text prompts using Google's Veo 2 API", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text description of the video", ), - "aspect_ratio": ( - IO.COMBO, - { - "options": ["16:9", "9:16"], - "default": "16:9", - "tooltip": "Aspect ratio of the output video", - }, + comfy_io.Combo.Input( + "aspect_ratio", + options=["16:9", "9:16"], + default="16:9", + tooltip="Aspect ratio of the output video", ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Negative text prompt to guide what to avoid in the video", - }, + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid in the video", + optional=True, ), - "duration_seconds": ( - IO.INT, - { - "default": 5, - "min": 5, - "max": 8, - "step": 1, - "display": "number", - "tooltip": "Duration of the output video in seconds", - }, + comfy_io.Int.Input( + "duration_seconds", + default=5, + min=5, + max=8, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds", + optional=True, ), - "enhance_prompt": ( - IO.BOOLEAN, - { - "default": True, - "tooltip": "Whether to enhance the prompt with AI assistance", - } + comfy_io.Boolean.Input( + "enhance_prompt", + default=True, + tooltip="Whether to enhance the prompt with AI assistance", + optional=True, ), - "person_generation": ( - IO.COMBO, - { - "options": ["ALLOW", "BLOCK"], - "default": "ALLOW", - "tooltip": "Whether to allow generating people in the video", - }, + comfy_io.Combo.Input( + "person_generation", + options=["ALLOW", "BLOCK"], + default="ALLOW", + tooltip="Whether to allow generating people in the video", + optional=True, ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFF, - "step": 1, - "display": "number", - "control_after_generate": True, - "tooltip": "Seed for video generation (0 for random)", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFF, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, ), - "image": (IO.IMAGE, { - "default": None, - "tooltip": "Optional reference image to guide video generation", - }), - "model": ( - IO.COMBO, - { - "options": ["veo-2.0-generate-001"], - "default": "veo-2.0-generate-001", - "tooltip": "Veo 2 model to use for video generation", - }, + comfy_io.Image.Input( + "image", + tooltip="Optional reference image to guide video generation", + optional=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + comfy_io.Combo.Input( + "model", + options=["veo-2.0-generate-001"], + default="veo-2.0-generate-001", + tooltip="Veo 2 model to use for video generation", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - RETURN_TYPES = (IO.VIDEO,) - FUNCTION = "generate_video" - CATEGORY = "api node/video/Veo" - DESCRIPTION = "Generates videos from text prompts using Google's Veo 2 API" - API_NODE = True - - async def generate_video( - self, + @classmethod + async def execute( + cls, prompt, aspect_ratio="16:9", negative_prompt="", @@ -164,8 +157,6 @@ class VeoVideoGenerationNode(ComfyNodeABC): image=None, model="veo-2.0-generate-001", generate_audio=False, - unique_id: Optional[str] = None, - **kwargs, ): # Prepare the instances for the request instances = [] @@ -202,6 +193,10 @@ class VeoVideoGenerationNode(ComfyNodeABC): if "veo-3.0" in model: parameters["generateAudio"] = generate_audio + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } # Initial request to start video generation initial_operation = SynchronousOperation( endpoint=ApiEndpoint( @@ -214,7 +209,7 @@ class VeoVideoGenerationNode(ComfyNodeABC): instances=instances, parameters=parameters ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) initial_response = await initial_operation.execute() @@ -248,10 +243,10 @@ class VeoVideoGenerationNode(ComfyNodeABC): request=VeoGenVidPollRequest( operationName=operation_name ), - auth_kwargs=kwargs, + auth_kwargs=auth, poll_interval=5.0, result_url_extractor=get_video_url_from_response, - node_id=unique_id, + node_id=cls.hidden.unique_id, estimated_duration=AVERAGE_DURATION_VIDEO_GEN, ) @@ -304,10 +299,10 @@ class VeoVideoGenerationNode(ComfyNodeABC): logging.info("Video generation completed successfully") # Convert video data to BytesIO object - video_io = io.BytesIO(video_data) + video_io = BytesIO(video_data) # Return VideoFromFile object - return (VideoFromFile(video_io),) + return comfy_io.NodeOutput(VideoFromFile(video_io)) class Veo3VideoGenerationNode(VeoVideoGenerationNode): @@ -323,51 +318,104 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): """ @classmethod - def INPUT_TYPES(s): - parent_input = super().INPUT_TYPES() - - # Update model options for Veo 3 - parent_input["optional"]["model"] = ( - IO.COMBO, - { - "options": ["veo-3.0-generate-001", "veo-3.0-fast-generate-001"], - "default": "veo-3.0-generate-001", - "tooltip": "Veo 3 model to use for video generation", - }, + def define_schema(cls): + return comfy_io.Schema( + node_id="Veo3VideoGenerationNode", + display_name="Google Veo 3 Video Generation", + category="api node/video/Veo", + description="Generates videos from text prompts using Google's Veo 3 API", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text description of the video", + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=["16:9", "9:16"], + default="16:9", + tooltip="Aspect ratio of the output video", + ), + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid in the video", + optional=True, + ), + comfy_io.Int.Input( + "duration_seconds", + default=8, + min=8, + max=8, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Duration of the output video in seconds (Veo 3 only supports 8 seconds)", + optional=True, + ), + comfy_io.Boolean.Input( + "enhance_prompt", + default=True, + tooltip="Whether to enhance the prompt with AI assistance", + optional=True, + ), + comfy_io.Combo.Input( + "person_generation", + options=["ALLOW", "BLOCK"], + default="ALLOW", + tooltip="Whether to allow generating people in the video", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFF, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation (0 for random)", + optional=True, + ), + comfy_io.Image.Input( + "image", + tooltip="Optional reference image to guide video generation", + optional=True, + ), + comfy_io.Combo.Input( + "model", + options=["veo-3.0-generate-001", "veo-3.0-fast-generate-001"], + default="veo-3.0-generate-001", + tooltip="Veo 3 model to use for video generation", + optional=True, + ), + comfy_io.Boolean.Input( + "generate_audio", + default=False, + tooltip="Generate audio for the video. Supported by all Veo 3 models.", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) - # Add generateAudio parameter - parent_input["optional"]["generate_audio"] = ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Generate audio for the video. Supported by all Veo 3 models.", - } - ) - # Update duration constraints for Veo 3 (only 8 seconds supported) - parent_input["optional"]["duration_seconds"] = ( - IO.INT, - { - "default": 8, - "min": 8, - "max": 8, - "step": 1, - "display": "number", - "tooltip": "Duration of the output video in seconds (Veo 3 only supports 8 seconds)", - }, - ) +class VeoExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + VeoVideoGenerationNode, + Veo3VideoGenerationNode, + ] - return parent_input - - -# Register the nodes -NODE_CLASS_MAPPINGS = { - "VeoVideoGenerationNode": VeoVideoGenerationNode, - "Veo3VideoGenerationNode": Veo3VideoGenerationNode, -} - -NODE_DISPLAY_NAME_MAPPINGS = { - "VeoVideoGenerationNode": "Google Veo 2 Video Generation", - "Veo3VideoGenerationNode": "Google Veo 3 Video Generation", -} +async def comfy_entrypoint() -> VeoExtension: + return VeoExtension() From 7ed73d12d13c2c389e0469c46c2db635a7d74278 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 22 Aug 2025 05:06:51 +0300 Subject: [PATCH 175/759] [V3] convert Ideogram API nodes to the V3 schema (#9278) * convert Ideogram API nodes to the V3 schema * use auth_kwargs instead of auth_token/comfy_api_key --- comfy_api_nodes/nodes_ideogram.py | 536 ++++++++++++++---------------- 1 file changed, 257 insertions(+), 279 deletions(-) diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index db24e6da4..d28895f3e 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -1,8 +1,8 @@ -from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeDict -from inspect import cleandoc +from io import BytesIO +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io as comfy_io from PIL import Image import numpy as np -import io import torch from comfy_api_nodes.apis import ( IdeogramGenerateRequest, @@ -246,90 +246,81 @@ def display_image_urls_on_node(image_urls, node_id): PromptServer.instance.send_progress_text(urls_text, node_id) -class IdeogramV1(ComfyNodeABC): - """ - Generates images using the Ideogram V1 model. - """ - - def __init__(self): - pass +class IdeogramV1(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls): + return comfy_io.Schema( + node_id="IdeogramV1", + display_name="Ideogram V1", + category="api node/image/Ideogram", + description="Generates images using the Ideogram V1 model.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "turbo": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to use turbo mode (faster generation, potentially lower quality)", - } + comfy_io.Boolean.Input( + "turbo", + default=False, + tooltip="Whether to use turbo mode (faster generation, potentially lower quality)", ), - }, - "optional": { - "aspect_ratio": ( - IO.COMBO, - { - "options": list(V1_V2_RATIO_MAP.keys()), - "default": "1:1", - "tooltip": "The aspect ratio for image generation.", - }, + comfy_io.Combo.Input( + "aspect_ratio", + options=list(V1_V2_RATIO_MAP.keys()), + default="1:1", + tooltip="The aspect ratio for image generation.", + optional=True, ), - "magic_prompt_option": ( - IO.COMBO, - { - "options": ["AUTO", "ON", "OFF"], - "default": "AUTO", - "tooltip": "Determine if MagicPrompt should be used in generation", - }, + comfy_io.Combo.Input( + "magic_prompt_option", + options=["AUTO", "ON", "OFF"], + default="AUTO", + tooltip="Determine if MagicPrompt should be used in generation", + optional=True, ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2147483647, - "step": 1, - "control_after_generate": True, - "display": "number", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + control_after_generate=True, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), - "negative_prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Description of what to exclude from the image", - }, + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Description of what to exclude from the image", + optional=True, ), - "num_images": ( - IO.INT, - {"default": 1, "min": 1, "max": 8, "step": 1, "display": "number"}, + comfy_io.Int.Input( + "num_images", + default=1, + min=1, + max=8, + step=1, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + ) - RETURN_TYPES = (IO.IMAGE,) - FUNCTION = "api_call" - CATEGORY = "api node/image/Ideogram" - DESCRIPTION = cleandoc(__doc__ or "") - API_NODE = True - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt, turbo=False, aspect_ratio="1:1", @@ -337,13 +328,15 @@ class IdeogramV1(ComfyNodeABC): seed=0, negative_prompt="", num_images=1, - unique_id=None, - **kwargs, ): # Determine the model based on turbo setting aspect_ratio = V1_V2_RATIO_MAP.get(aspect_ratio, None) model = "V_1_TURBO" if turbo else "V_1" + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/ideogram/generate", @@ -364,7 +357,7 @@ class IdeogramV1(ComfyNodeABC): negative_prompt=negative_prompt if negative_prompt else None, ) ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) response = await operation.execute() @@ -377,93 +370,85 @@ class IdeogramV1(ComfyNodeABC): if not image_urls: raise Exception("No image URLs were generated in the response") - display_image_urls_on_node(image_urls, unique_id) - return (await download_and_process_images(image_urls),) + display_image_urls_on_node(image_urls, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_and_process_images(image_urls)) -class IdeogramV2(ComfyNodeABC): - """ - Generates images using the Ideogram V2 model. - """ - - def __init__(self): - pass +class IdeogramV2(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls): + return comfy_io.Schema( + node_id="IdeogramV2", + display_name="Ideogram V2", + category="api node/image/Ideogram", + description="Generates images using the Ideogram V2 model.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "turbo": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to use turbo mode (faster generation, potentially lower quality)", - } + comfy_io.Boolean.Input( + "turbo", + default=False, + tooltip="Whether to use turbo mode (faster generation, potentially lower quality)", ), - }, - "optional": { - "aspect_ratio": ( - IO.COMBO, - { - "options": list(V1_V2_RATIO_MAP.keys()), - "default": "1:1", - "tooltip": "The aspect ratio for image generation. Ignored if resolution is not set to AUTO.", - }, + comfy_io.Combo.Input( + "aspect_ratio", + options=list(V1_V2_RATIO_MAP.keys()), + default="1:1", + tooltip="The aspect ratio for image generation. Ignored if resolution is not set to AUTO.", + optional=True, ), - "resolution": ( - IO.COMBO, - { - "options": list(V1_V1_RES_MAP.keys()), - "default": "Auto", - "tooltip": "The resolution for image generation. If not set to AUTO, this overrides the aspect_ratio setting.", - }, + comfy_io.Combo.Input( + "resolution", + options=list(V1_V1_RES_MAP.keys()), + default="Auto", + tooltip="The resolution for image generation. " + "If not set to AUTO, this overrides the aspect_ratio setting.", + optional=True, ), - "magic_prompt_option": ( - IO.COMBO, - { - "options": ["AUTO", "ON", "OFF"], - "default": "AUTO", - "tooltip": "Determine if MagicPrompt should be used in generation", - }, + comfy_io.Combo.Input( + "magic_prompt_option", + options=["AUTO", "ON", "OFF"], + default="AUTO", + tooltip="Determine if MagicPrompt should be used in generation", + optional=True, ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2147483647, - "step": 1, - "control_after_generate": True, - "display": "number", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + control_after_generate=True, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), - "style_type": ( - IO.COMBO, - { - "options": ["AUTO", "GENERAL", "REALISTIC", "DESIGN", "RENDER_3D", "ANIME"], - "default": "NONE", - "tooltip": "Style type for generation (V2 only)", - }, + comfy_io.Combo.Input( + "style_type", + options=["AUTO", "GENERAL", "REALISTIC", "DESIGN", "RENDER_3D", "ANIME"], + default="NONE", + tooltip="Style type for generation (V2 only)", + optional=True, ), - "negative_prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Description of what to exclude from the image", - }, + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Description of what to exclude from the image", + optional=True, ), - "num_images": ( - IO.INT, - {"default": 1, "min": 1, "max": 8, "step": 1, "display": "number"}, + comfy_io.Int.Input( + "num_images", + default=1, + min=1, + max=8, + step=1, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), #"color_palette": ( # IO.STRING, @@ -473,22 +458,20 @@ class IdeogramV2(ComfyNodeABC): # "tooltip": "Color palette preset name or hex colors with weights", # }, #), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + ) - RETURN_TYPES = (IO.IMAGE,) - FUNCTION = "api_call" - CATEGORY = "api node/image/Ideogram" - DESCRIPTION = cleandoc(__doc__ or "") - API_NODE = True - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt, turbo=False, aspect_ratio="1:1", @@ -499,8 +482,6 @@ class IdeogramV2(ComfyNodeABC): negative_prompt="", num_images=1, color_palette="", - unique_id=None, - **kwargs, ): aspect_ratio = V1_V2_RATIO_MAP.get(aspect_ratio, None) resolution = V1_V1_RES_MAP.get(resolution, None) @@ -517,6 +498,10 @@ class IdeogramV2(ComfyNodeABC): else: final_aspect_ratio = aspect_ratio if aspect_ratio != "ASPECT_1_1" else None + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/ideogram/generate", @@ -540,7 +525,7 @@ class IdeogramV2(ComfyNodeABC): color_palette=color_palette if color_palette else None, ) ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) response = await operation.execute() @@ -553,108 +538,99 @@ class IdeogramV2(ComfyNodeABC): if not image_urls: raise Exception("No image URLs were generated in the response") - display_image_urls_on_node(image_urls, unique_id) - return (await download_and_process_images(image_urls),) + display_image_urls_on_node(image_urls, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_and_process_images(image_urls)) -class IdeogramV3(ComfyNodeABC): - """ - Generates images using the Ideogram V3 model. Supports both regular image generation from text prompts and image editing with mask. - """ - def __init__(self): - pass +class IdeogramV3(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation or editing", - }, + def define_schema(cls): + return comfy_io.Schema( + node_id="IdeogramV3", + display_name="Ideogram V3", + category="api node/image/Ideogram", + description="Generates images using the Ideogram V3 model. " + "Supports both regular image generation from text prompts and image editing with mask.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation or editing", ), - }, - "optional": { - "image": ( - IO.IMAGE, - { - "default": None, - "tooltip": "Optional reference image for image editing.", - }, + comfy_io.Image.Input( + "image", + tooltip="Optional reference image for image editing.", + optional=True, ), - "mask": ( - IO.MASK, - { - "default": None, - "tooltip": "Optional mask for inpainting (white areas will be replaced)", - }, + comfy_io.Mask.Input( + "mask", + tooltip="Optional mask for inpainting (white areas will be replaced)", + optional=True, ), - "aspect_ratio": ( - IO.COMBO, - { - "options": list(V3_RATIO_MAP.keys()), - "default": "1:1", - "tooltip": "The aspect ratio for image generation. Ignored if resolution is not set to Auto.", - }, + comfy_io.Combo.Input( + "aspect_ratio", + options=list(V3_RATIO_MAP.keys()), + default="1:1", + tooltip="The aspect ratio for image generation. Ignored if resolution is not set to Auto.", + optional=True, ), - "resolution": ( - IO.COMBO, - { - "options": V3_RESOLUTIONS, - "default": "Auto", - "tooltip": "The resolution for image generation. If not set to Auto, this overrides the aspect_ratio setting.", - }, + comfy_io.Combo.Input( + "resolution", + options=V3_RESOLUTIONS, + default="Auto", + tooltip="The resolution for image generation. " + "If not set to Auto, this overrides the aspect_ratio setting.", + optional=True, ), - "magic_prompt_option": ( - IO.COMBO, - { - "options": ["AUTO", "ON", "OFF"], - "default": "AUTO", - "tooltip": "Determine if MagicPrompt should be used in generation", - }, + comfy_io.Combo.Input( + "magic_prompt_option", + options=["AUTO", "ON", "OFF"], + default="AUTO", + tooltip="Determine if MagicPrompt should be used in generation", + optional=True, ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2147483647, - "step": 1, - "control_after_generate": True, - "display": "number", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + control_after_generate=True, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), - "num_images": ( - IO.INT, - {"default": 1, "min": 1, "max": 8, "step": 1, "display": "number"}, + comfy_io.Int.Input( + "num_images", + default=1, + min=1, + max=8, + step=1, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), - "rendering_speed": ( - IO.COMBO, - { - "options": ["BALANCED", "TURBO", "QUALITY"], - "default": "BALANCED", - "tooltip": "Controls the trade-off between generation speed and quality", - }, + comfy_io.Combo.Input( + "rendering_speed", + options=["BALANCED", "TURBO", "QUALITY"], + default="BALANCED", + tooltip="Controls the trade-off between generation speed and quality", + optional=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + ) - RETURN_TYPES = (IO.IMAGE,) - FUNCTION = "api_call" - CATEGORY = "api node/image/Ideogram" - DESCRIPTION = cleandoc(__doc__ or "") - API_NODE = True - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt, image=None, mask=None, @@ -664,9 +640,11 @@ class IdeogramV3(ComfyNodeABC): seed=0, num_images=1, rendering_speed="BALANCED", - unique_id=None, - **kwargs, ): + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } # Check if both image and mask are provided for editing mode if image is not None and mask is not None: # Edit mode @@ -686,7 +664,7 @@ class IdeogramV3(ComfyNodeABC): # Process image img_np = (input_tensor.numpy() * 255).astype(np.uint8) img = Image.fromarray(img_np) - img_byte_arr = io.BytesIO() + img_byte_arr = BytesIO() img.save(img_byte_arr, format="PNG") img_byte_arr.seek(0) img_binary = img_byte_arr @@ -695,7 +673,7 @@ class IdeogramV3(ComfyNodeABC): # Process mask - white areas will be replaced mask_np = (mask.squeeze().cpu().numpy() * 255).astype(np.uint8) mask_img = Image.fromarray(mask_np) - mask_byte_arr = io.BytesIO() + mask_byte_arr = BytesIO() mask_img.save(mask_byte_arr, format="PNG") mask_byte_arr.seek(0) mask_binary = mask_byte_arr @@ -729,7 +707,7 @@ class IdeogramV3(ComfyNodeABC): "mask": mask_binary, }, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) elif image is not None or mask is not None: @@ -770,7 +748,7 @@ class IdeogramV3(ComfyNodeABC): response_model=IdeogramGenerateResponse, ), request=gen_request, - auth_kwargs=kwargs, + auth_kwargs=auth, ) # Execute the operation and process response @@ -784,18 +762,18 @@ class IdeogramV3(ComfyNodeABC): if not image_urls: raise Exception("No image URLs were generated in the response") - display_image_urls_on_node(image_urls, unique_id) - return (await download_and_process_images(image_urls),) + display_image_urls_on_node(image_urls, cls.hidden.unique_id) + return comfy_io.NodeOutput(await download_and_process_images(image_urls)) -NODE_CLASS_MAPPINGS = { - "IdeogramV1": IdeogramV1, - "IdeogramV2": IdeogramV2, - "IdeogramV3": IdeogramV3, -} +class IdeogramExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + IdeogramV1, + IdeogramV2, + IdeogramV3, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "IdeogramV1": "Ideogram V1", - "IdeogramV2": "Ideogram V2", - "IdeogramV3": "Ideogram V3", -} +async def comfy_entrypoint() -> IdeogramExtension: + return IdeogramExtension() From f7bd5e58dd03e799e02f6851b84b51e14ad0da7b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 21 Aug 2025 20:18:04 -0700 Subject: [PATCH 176/759] Make it easier to implement future qwen controlnets. (#9485) --- comfy/controlnet.py | 4 ++-- comfy/ldm/qwen_image/model.py | 16 +++++++++++++--- comfy/model_detection.py | 2 ++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 988acdb57..6cb69dcdf 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -236,11 +236,11 @@ class ControlNet(ControlBase): self.cond_hint = None compression_ratio = self.compression_ratio if self.vae is not None: - compression_ratio *= self.vae.downscale_ratio + compression_ratio *= self.vae.spacial_compression_encode() else: if self.latent_format is not None: raise ValueError("This Controlnet needs a VAE but none was provided, please use a ControlNetApply node with a VAE input and connect it.") - self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * compression_ratio, x_noisy.shape[2] * compression_ratio, self.upscale_algorithm, "center") + self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[-1] * compression_ratio, x_noisy.shape[-2] * compression_ratio, self.upscale_algorithm, "center") self.cond_hint = self.preprocess_image(self.cond_hint) if self.vae is not None: loaded_models = comfy.model_management.loaded_models(only_currently_used=True) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 2503583cb..d0e39833a 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -293,6 +293,7 @@ class QwenImageTransformer2DModel(nn.Module): guidance_embeds: bool = False, axes_dims_rope: Tuple[int, int, int] = (16, 56, 56), image_model=None, + final_layer=True, dtype=None, device=None, operations=None, @@ -300,6 +301,7 @@ class QwenImageTransformer2DModel(nn.Module): super().__init__() self.dtype = dtype self.patch_size = patch_size + self.in_channels = in_channels self.out_channels = out_channels or in_channels self.inner_dim = num_attention_heads * attention_head_dim @@ -329,9 +331,9 @@ class QwenImageTransformer2DModel(nn.Module): for _ in range(num_layers) ]) - self.norm_out = LastLayer(self.inner_dim, self.inner_dim, dtype=dtype, device=device, operations=operations) - self.proj_out = operations.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True, dtype=dtype, device=device) - self.gradient_checkpointing = False + if final_layer: + self.norm_out = LastLayer(self.inner_dim, self.inner_dim, dtype=dtype, device=device, operations=operations) + self.proj_out = operations.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True, dtype=dtype, device=device) def process_img(self, x, index=0, h_offset=0, w_offset=0): bs, c, t, h, w = x.shape @@ -362,6 +364,7 @@ class QwenImageTransformer2DModel(nn.Module): guidance: torch.Tensor = None, ref_latents=None, transformer_options={}, + control=None, **kwargs ): timestep = timesteps @@ -443,6 +446,13 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states = out["img"] encoder_hidden_states = out["txt"] + if control is not None: # Controlnet + control_i = control.get("input") + if i < len(control_i): + add = control_i[i] + if add is not None: + hidden_states += add + hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 2bec0541e..0caff53e0 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -492,6 +492,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): if '{}txt_norm.weight'.format(key_prefix) in state_dict_keys: # Qwen Image dit_config = {} dit_config["image_model"] = "qwen_image" + dit_config["in_channels"] = state_dict['{}img_in.weight'.format(key_prefix)].shape[1] + dit_config["num_layers"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.') return dit_config if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys: From ff57793659702d502506047445f0972b10b6b9fe Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 21 Aug 2025 21:53:11 -0700 Subject: [PATCH 177/759] Support InstantX Qwen controlnet. (#9488) --- comfy/controlnet.py | 13 +++++ comfy/ldm/qwen_image/controlnet.py | 77 ++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 comfy/ldm/qwen_image/controlnet.py diff --git a/comfy/controlnet.py b/comfy/controlnet.py index 6cb69dcdf..e3dfedf55 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -36,6 +36,7 @@ import comfy.ldm.cascade.controlnet import comfy.cldm.mmdit import comfy.ldm.hydit.controlnet import comfy.ldm.flux.controlnet +import comfy.ldm.qwen_image.controlnet import comfy.cldm.dit_embedder from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -582,6 +583,15 @@ def load_controlnet_flux_instantx(sd, model_options={}): control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds) return control +def load_controlnet_qwen_instantx(sd, model_options={}): + model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options) + control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config) + control_model = controlnet_load_state_dict(control_model, sd) + latent_format = comfy.latent_formats.Wan21() + extra_conds = [] + control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds) + return control + def convert_mistoline(sd): return comfy.utils.state_dict_prefix_replace(sd, {"single_controlnet_blocks.": "controlnet_single_blocks."}) @@ -655,8 +665,11 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}): return load_controlnet_sd35(controlnet_data, model_options=model_options) #Stability sd3.5 format else: return load_controlnet_mmdit(controlnet_data, model_options=model_options) #SD3 diffusers controlnet + elif "transformer_blocks.0.img_mlp.net.0.proj.weight" in controlnet_data: + return load_controlnet_qwen_instantx(controlnet_data, model_options=model_options) elif "controlnet_x_embedder.weight" in controlnet_data: return load_controlnet_flux_instantx(controlnet_data, model_options=model_options) + elif "controlnet_blocks.0.linear.weight" in controlnet_data: #mistoline flux return load_controlnet_flux_xlabs_mistoline(convert_mistoline(controlnet_data), mistoline=True, model_options=model_options) diff --git a/comfy/ldm/qwen_image/controlnet.py b/comfy/ldm/qwen_image/controlnet.py new file mode 100644 index 000000000..92ac3cf0a --- /dev/null +++ b/comfy/ldm/qwen_image/controlnet.py @@ -0,0 +1,77 @@ +import torch +import math + +from .model import QwenImageTransformer2DModel + + +class QwenImageControlNetModel(QwenImageTransformer2DModel): + def __init__( + self, + extra_condition_channels=0, + dtype=None, + device=None, + operations=None, + **kwargs + ): + super().__init__(final_layer=False, dtype=dtype, device=device, operations=operations, **kwargs) + self.main_model_double = 60 + + # controlnet_blocks + self.controlnet_blocks = torch.nn.ModuleList([]) + for _ in range(len(self.transformer_blocks)): + self.controlnet_blocks.append(operations.Linear(self.inner_dim, self.inner_dim, device=device, dtype=dtype)) + self.controlnet_x_embedder = operations.Linear(self.in_channels + extra_condition_channels, self.inner_dim, device=device, dtype=dtype) + + def forward( + self, + x, + timesteps, + context, + attention_mask=None, + guidance: torch.Tensor = None, + ref_latents=None, + hint=None, + transformer_options={}, + **kwargs + ): + timestep = timesteps + encoder_hidden_states = context + encoder_hidden_states_mask = attention_mask + + hidden_states, img_ids, orig_shape = self.process_img(x) + hint, _, _ = self.process_img(hint) + + txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) + txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) + ids = torch.cat((txt_ids, img_ids), dim=1) + image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + del ids, txt_ids, img_ids + + hidden_states = self.img_in(hidden_states) + self.controlnet_x_embedder(hint) + encoder_hidden_states = self.txt_norm(encoder_hidden_states) + encoder_hidden_states = self.txt_in(encoder_hidden_states) + + if guidance is not None: + guidance = guidance * 1000 + + temb = ( + self.time_text_embed(timestep, hidden_states) + if guidance is None + else self.time_text_embed(timestep, guidance, hidden_states) + ) + + repeat = math.ceil(self.main_model_double / len(self.controlnet_blocks)) + + controlnet_block_samples = () + for i, block in enumerate(self.transformer_blocks): + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + encoder_hidden_states_mask=encoder_hidden_states_mask, + temb=temb, + image_rotary_emb=image_rotary_emb, + ) + + controlnet_block_samples = controlnet_block_samples + (self.controlnet_blocks[i](hidden_states),) * repeat + + return {"input": controlnet_block_samples[:self.main_model_double]} From 497d41fb500668635aa4a782549e9a0caa24375e Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 22 Aug 2025 20:50:35 +0300 Subject: [PATCH 178/759] feat(api-nodes): change "OpenAI Chat" display name to "OpenAI ChatGPT" (#9443) --- comfy_api_nodes/nodes_openai.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index 674c9ede0..e3b81de75 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -998,7 +998,7 @@ NODE_DISPLAY_NAME_MAPPINGS = { "OpenAIDalle2": "OpenAI DALL·E 2", "OpenAIDalle3": "OpenAI DALL·E 3", "OpenAIGPTImage1": "OpenAI GPT Image 1", - "OpenAIChatNode": "OpenAI Chat", - "OpenAIInputFiles": "OpenAI Chat Input Files", - "OpenAIChatConfig": "OpenAI Chat Advanced Options", + "OpenAIChatNode": "OpenAI ChatGPT", + "OpenAIInputFiles": "OpenAI ChatGPT Input Files", + "OpenAIChatConfig": "OpenAI ChatGPT Advanced Options", } From 050c67323c33f6543309d4f09df706ec8c9a1389 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 22 Aug 2025 20:51:14 +0300 Subject: [PATCH 179/759] feat(api-nodes): add copy button to Gemini Chat node (#9440) --- comfy_api_nodes/nodes_gemini.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index ba4167a50..78c402a7a 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -5,7 +5,10 @@ See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/infer from __future__ import annotations +import json +import time import os +import uuid from enum import Enum from typing import Optional, Literal @@ -350,7 +353,27 @@ class GeminiNode(ComfyNodeABC): # Get result output output_text = self.get_text_from_response(response) if unique_id and output_text: - PromptServer.instance.send_progress_text(output_text, node_id=unique_id) + # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. + render_spec = { + "node_id": unique_id, + "component": "ChatHistoryWidget", + "props": { + "history": json.dumps( + [ + { + "prompt": prompt, + "response": output_text, + "response_id": str(uuid.uuid4()), + "timestamp": time.time(), + } + ] + ), + }, + } + PromptServer.instance.send_sync( + "display_component", + render_spec, + ) return (output_text or "Empty response from Gemini model...",) From ca4e96a8ae6c9ee8d40fe35100ed9b2247e71e40 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sat, 23 Aug 2025 05:40:18 +0800 Subject: [PATCH 180/759] Update template to 0.1.65 (#9501) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8d928d826..6b53fabc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.9 -comfyui-workflow-templates==0.1.62 +comfyui-workflow-templates==0.1.65 comfyui-embedded-docs==0.2.6 torch torchsde From fe31ad02768c66c61b3dc12f5d4bdfe8990ce25c Mon Sep 17 00:00:00 2001 From: contentis Date: Sat, 23 Aug 2025 01:39:15 +0200 Subject: [PATCH 181/759] Add elementwise fusions (#9495) * Add elementwise fusions * Add addcmul pattern to Qwen --- comfy/ldm/modules/diffusionmodules/mmdit.py | 12 +++++++----- comfy/ldm/qwen_image/model.py | 12 ++++++------ comfy/ldm/wan/model.py | 14 +++++++------- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py index eaf3e73a4..4d6beba2d 100644 --- a/comfy/ldm/modules/diffusionmodules/mmdit.py +++ b/comfy/ldm/modules/diffusionmodules/mmdit.py @@ -109,7 +109,7 @@ class PatchEmbed(nn.Module): def modulate(x, shift, scale): if shift is None: shift = torch.zeros_like(scale) - return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + return torch.addcmul(shift.unsqueeze(1), x, 1+ scale.unsqueeze(1)) ################################################################################# @@ -564,10 +564,7 @@ class DismantledBlock(nn.Module): assert not self.pre_only attn1 = self.attn.post_attention(attn) attn2 = self.attn2.post_attention(attn2) - out1 = gate_msa.unsqueeze(1) * attn1 - out2 = gate_msa2.unsqueeze(1) * attn2 - x = x + out1 - x = x + out2 + x = gate_cat(x, gate_msa, gate_msa2, attn1, attn2) x = x + gate_mlp.unsqueeze(1) * self.mlp( modulate(self.norm2(x), shift_mlp, scale_mlp) ) @@ -594,6 +591,11 @@ class DismantledBlock(nn.Module): ) return self.post_attention(attn, *intermediates) +def gate_cat(x, gate_msa, gate_msa2, attn1, attn2): + out1 = gate_msa.unsqueeze(1) * attn1 + out2 = gate_msa2.unsqueeze(1) * attn2 + x = torch.stack([x, out1, out2], dim=0).sum(dim=0) + return x def block_mixing(*args, use_checkpoint=True, **kwargs): if use_checkpoint: diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index d0e39833a..af00ff119 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -214,9 +214,9 @@ class QwenImageTransformerBlock(nn.Module): operations=operations, ) - def _modulate(self, x, mod_params): - shift, scale, gate = mod_params.chunk(3, dim=-1) - return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1) + def _modulate(self, x: torch.Tensor, mod_params: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + shift, scale, gate = torch.chunk(mod_params, 3, dim=-1) + return torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1)), gate.unsqueeze(1) def forward( self, @@ -248,11 +248,11 @@ class QwenImageTransformerBlock(nn.Module): img_normed2 = self.img_norm2(hidden_states) img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2) - hidden_states = hidden_states + img_gate2 * self.img_mlp(img_modulated2) + hidden_states = torch.addcmul(hidden_states, img_gate2, self.img_mlp(img_modulated2)) txt_normed2 = self.txt_norm2(encoder_hidden_states) txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2) - encoder_hidden_states = encoder_hidden_states + txt_gate2 * self.txt_mlp(txt_modulated2) + encoder_hidden_states = torch.addcmul(encoder_hidden_states, txt_gate2, self.txt_mlp(txt_modulated2)) return encoder_hidden_states, hidden_states @@ -275,7 +275,7 @@ class LastLayer(nn.Module): def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: emb = self.linear(self.silu(conditioning_embedding)) scale, shift = torch.chunk(emb, 2, dim=1) - x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :] + x = torch.addcmul(shift[:, None, :], self.norm(x), (1 + scale)[:, None, :]) return x diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 9d3741be3..0726b8e1b 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -148,8 +148,8 @@ WAN_CROSSATTENTION_CLASSES = { def repeat_e(e, x): repeats = 1 - if e.shape[1] > 1: - repeats = x.shape[1] // e.shape[1] + if e.size(1) > 1: + repeats = x.size(1) // e.size(1) if repeats == 1: return e return torch.repeat_interleave(e, repeats, dim=1) @@ -219,15 +219,15 @@ class WanAttentionBlock(nn.Module): # self-attention y = self.self_attn( - self.norm1(x) * (1 + repeat_e(e[1], x)) + repeat_e(e[0], x), + torch.addcmul(repeat_e(e[0], x), self.norm1(x), 1 + repeat_e(e[1], x)), freqs) - x = x + y * repeat_e(e[2], x) + x = torch.addcmul(x, y, repeat_e(e[2], x)) # cross-attention & ffn x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len) - y = self.ffn(self.norm2(x) * (1 + repeat_e(e[4], x)) + repeat_e(e[3], x)) - x = x + y * repeat_e(e[5], x) + y = self.ffn(torch.addcmul(repeat_e(e[3], x), self.norm2(x), 1 + repeat_e(e[4], x))) + x = torch.addcmul(x, y, repeat_e(e[5], x)) return x @@ -342,7 +342,7 @@ class Head(nn.Module): else: e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e.unsqueeze(2)).unbind(2) - x = (self.head(self.norm(x) * (1 + repeat_e(e[1], x)) + repeat_e(e[0], x))) + x = (self.head(torch.addcmul(repeat_e(e[0], x), self.norm(x), 1 + repeat_e(e[1], x)))) return x From fc247150fec502b1834390516b556a87003f1d79 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Fri, 22 Aug 2025 19:41:08 -0700 Subject: [PATCH 182/759] Implement EasyCache and Invent LazyCache (#9496) * Attempting a universal implementation of EasyCache, starting with flux as test; I screwed up the math a bit, but when I set it just right it works. * Fixed math to make threshold work as expected, refactored code to use EasyCacheHolder instead of a dict wrapped by object * Use sigmas from transformer_options instead of timesteps to be compatible with a greater amount of models, make end_percent work * Make log statement when not skipping useful, preparing for per-cond caching * Added DIFFUSION_MODEL wrapper around forward function for wan model * Add subsampling for heuristic inputs * Add subsampling to output_prev (output_prev_subsampled now) * Properly consider conds in EasyCache logic * Created SuperEasyCache to test what happens if caching and reuse is moved outside the scope of conds, added PREDICT_NOISE wrapper to facilitate this test * Change max reuse_threshold to 3.0 * Mark EasyCache/SuperEasyCache as experimental (beta) * Make Lumina2 compatible with EasyCache * Add EasyCache support for Qwen Image * Fix missing comma, curse you Cursor * Add EasyCache support to AceStep * Add EasyCache support to Chroma * Added EasyCache support to Cosmos Predict t2i * Make EasyCache not crash with Cosmos Predict ImagToVideo latents, but does not work well at all * Add EasyCache support to hidream * Added EasyCache support to hunyuan video * Added EasyCache support to hunyuan3d * Added EasyCache support to LTXV (not very good, but does not crash) * Implemented EasyCache for aura_flow * Renamed SuperEasyCache to LazyCache, hardcoded subsample_factor to 8 on nodes * Eatra logging when verbose is true for EasyCache --- comfy/ldm/ace/model.py | 24 +- comfy/ldm/aura/mmdit.py | 8 + comfy/ldm/chroma/model.py | 8 + comfy/ldm/cosmos/model.py | 38 +++ comfy/ldm/cosmos/predict2.py | 17 +- comfy/ldm/flux/model.py | 8 + comfy/ldm/hidream/model.py | 19 +- comfy/ldm/hunyuan3d/model.py | 8 + comfy/ldm/hunyuan_video/model.py | 8 + comfy/ldm/lightricks/model.py | 8 + comfy/ldm/lumina/model.py | 10 +- comfy/ldm/qwen_image/model.py | 10 +- comfy/ldm/wan/model.py | 8 + comfy/patcher_extension.py | 1 + comfy/samplers.py | 9 +- comfy_extras/nodes_easycache.py | 459 +++++++++++++++++++++++++++++++ nodes.py | 3 +- 17 files changed, 639 insertions(+), 7 deletions(-) create mode 100644 comfy_extras/nodes_easycache.py diff --git a/comfy/ldm/ace/model.py b/comfy/ldm/ace/model.py index 12c524701..41d85eeb5 100644 --- a/comfy/ldm/ace/model.py +++ b/comfy/ldm/ace/model.py @@ -19,6 +19,7 @@ import torch from torch import nn import comfy.model_management +import comfy.patcher_extension from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps from .attention import LinearTransformerBlock, t2i_modulate @@ -343,7 +344,28 @@ class ACEStepTransformer2DModel(nn.Module): output = self.final_layer(hidden_states, embedded_timestep, output_length) return output - def forward( + def forward(self, + x, + timestep, + attention_mask=None, + context: Optional[torch.Tensor] = None, + text_attention_mask: Optional[torch.LongTensor] = None, + speaker_embeds: Optional[torch.FloatTensor] = None, + lyric_token_idx: Optional[torch.LongTensor] = None, + lyric_mask: Optional[torch.LongTensor] = None, + block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None, + controlnet_scale: Union[float, torch.Tensor] = 1.0, + lyrics_strength=1.0, + **kwargs + ): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, kwargs.get("transformer_options", {})) + ).execute(x, timestep, attention_mask, context, text_attention_mask, speaker_embeds, lyric_token_idx, lyric_mask, block_controlnet_hidden_states, + controlnet_scale, lyrics_strength, **kwargs) + + def _forward( self, x, timestep, diff --git a/comfy/ldm/aura/mmdit.py b/comfy/ldm/aura/mmdit.py index 1258ae11f..d7f32b5e8 100644 --- a/comfy/ldm/aura/mmdit.py +++ b/comfy/ldm/aura/mmdit.py @@ -9,6 +9,7 @@ import torch.nn.functional as F from comfy.ldm.modules.attention import optimized_attention import comfy.ops +import comfy.patcher_extension import comfy.ldm.common_dit def modulate(x, shift, scale): @@ -436,6 +437,13 @@ class MMDiT(nn.Module): return x + pos_encoding.reshape(1, -1, self.positional_encoding.shape[-1]) def forward(self, x, timestep, context, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, transformer_options, **kwargs) + + def _forward(self, x, timestep, context, transformer_options={}, **kwargs): patches_replace = transformer_options.get("patches_replace", {}) # patchify x, add PE b, c, h, w = x.shape diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py index 06021d4f2..5cff44dc8 100644 --- a/comfy/ldm/chroma/model.py +++ b/comfy/ldm/chroma/model.py @@ -5,6 +5,7 @@ from dataclasses import dataclass import torch from torch import Tensor, nn from einops import rearrange, repeat +import comfy.patcher_extension import comfy.ldm.common_dit from comfy.ldm.flux.layers import ( @@ -253,6 +254,13 @@ class Chroma(nn.Module): return img def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, guidance, control, transformer_options, **kwargs) + + def _forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs): bs, c, h, w = x.shape x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size)) diff --git a/comfy/ldm/cosmos/model.py b/comfy/ldm/cosmos/model.py index 4836e0b69..53698b758 100644 --- a/comfy/ldm/cosmos/model.py +++ b/comfy/ldm/cosmos/model.py @@ -27,6 +27,8 @@ from torchvision import transforms from enum import Enum import logging +import comfy.patcher_extension + from .blocks import ( FinalLayer, GeneralDITTransformerBlock, @@ -435,6 +437,42 @@ class GeneralDIT(nn.Module): latent_condition_sigma: Optional[torch.Tensor] = None, condition_video_augment_sigma: Optional[torch.Tensor] = None, **kwargs, + ): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, kwargs.get("transformer_options", {})) + ).execute(x, + timesteps, + context, + attention_mask, + fps, + image_size, + padding_mask, + scalar_feature, + data_type, + latent_condition, + latent_condition_sigma, + condition_video_augment_sigma, + **kwargs) + + def _forward( + self, + x: torch.Tensor, + timesteps: torch.Tensor, + context: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + # crossattn_emb: torch.Tensor, + # crossattn_mask: Optional[torch.Tensor] = None, + fps: Optional[torch.Tensor] = None, + image_size: Optional[torch.Tensor] = None, + padding_mask: Optional[torch.Tensor] = None, + scalar_feature: Optional[torch.Tensor] = None, + data_type: Optional[DataType] = DataType.VIDEO, + latent_condition: Optional[torch.Tensor] = None, + latent_condition_sigma: Optional[torch.Tensor] = None, + condition_video_augment_sigma: Optional[torch.Tensor] = None, + **kwargs, ): """ Args: diff --git a/comfy/ldm/cosmos/predict2.py b/comfy/ldm/cosmos/predict2.py index 316117f77..fcc83ba76 100644 --- a/comfy/ldm/cosmos/predict2.py +++ b/comfy/ldm/cosmos/predict2.py @@ -11,6 +11,7 @@ import math from .position_embedding import VideoRopePosition3DEmb, LearnablePosEmbAxis from torchvision import transforms +import comfy.patcher_extension from comfy.ldm.modules.attention import optimized_attention def apply_rotary_pos_emb( @@ -805,7 +806,21 @@ class MiniTrainDIT(nn.Module): ) return x_B_C_Tt_Hp_Wp - def forward( + def forward(self, + x: torch.Tensor, + timesteps: torch.Tensor, + context: torch.Tensor, + fps: Optional[torch.Tensor] = None, + padding_mask: Optional[torch.Tensor] = None, + **kwargs, + ): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, kwargs.get("transformer_options", {})) + ).execute(x, timesteps, context, fps, padding_mask, **kwargs) + + def _forward( self, x: torch.Tensor, timesteps: torch.Tensor, diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index c4de82795..0a77fa097 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -6,6 +6,7 @@ import torch from torch import Tensor, nn from einops import rearrange, repeat import comfy.ldm.common_dit +import comfy.patcher_extension from .layers import ( DoubleStreamBlock, @@ -214,6 +215,13 @@ class Flux(nn.Module): return img, repeat(img_ids, "h w c -> b (h w) c", b=bs) def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, y, guidance, ref_latents, control, transformer_options, **kwargs) + + def _forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs): bs, c, h_orig, w_orig = x.shape patch_size = self.patch_size diff --git a/comfy/ldm/hidream/model.py b/comfy/ldm/hidream/model.py index 0305747bf..ae49cf945 100644 --- a/comfy/ldm/hidream/model.py +++ b/comfy/ldm/hidream/model.py @@ -13,6 +13,7 @@ from comfy.ldm.flux.layers import LastLayer from comfy.ldm.modules.attention import optimized_attention import comfy.model_management +import comfy.patcher_extension import comfy.ldm.common_dit @@ -692,7 +693,23 @@ class HiDreamImageTransformer2DModel(nn.Module): raise NotImplementedError return x, x_masks, img_sizes - def forward( + def forward(self, + x: torch.Tensor, + t: torch.Tensor, + y: Optional[torch.Tensor] = None, + context: Optional[torch.Tensor] = None, + encoder_hidden_states_llama3=None, + image_cond=None, + control = None, + transformer_options = {}, + ): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, t, y, context, encoder_hidden_states_llama3, image_cond, control, transformer_options) + + def _forward( self, x: torch.Tensor, t: torch.Tensor, diff --git a/comfy/ldm/hunyuan3d/model.py b/comfy/ldm/hunyuan3d/model.py index 4e18358f0..0fa5e78c1 100644 --- a/comfy/ldm/hunyuan3d/model.py +++ b/comfy/ldm/hunyuan3d/model.py @@ -7,6 +7,7 @@ from comfy.ldm.flux.layers import ( SingleStreamBlock, timestep_embedding, ) +import comfy.patcher_extension class Hunyuan3Dv2(nn.Module): @@ -67,6 +68,13 @@ class Hunyuan3Dv2(nn.Module): self.final_layer = LastLayer(hidden_size, 1, in_channels, dtype=dtype, device=device, operations=operations) def forward(self, x, timestep, context, guidance=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, guidance, transformer_options, **kwargs) + + def _forward(self, x, timestep, context, guidance=None, transformer_options={}, **kwargs): x = x.movedim(-1, -2) timestep = 1.0 - timestep txt = context diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py index fbd8d4196..da1011596 100644 --- a/comfy/ldm/hunyuan_video/model.py +++ b/comfy/ldm/hunyuan_video/model.py @@ -1,6 +1,7 @@ #Based on Flux code because of weird hunyuan video code license. import torch +import comfy.patcher_extension import comfy.ldm.flux.layers import comfy.ldm.modules.diffusionmodules.mmdit from comfy.ldm.modules.attention import optimized_attention @@ -348,6 +349,13 @@ class HunyuanVideo(nn.Module): return repeat(img_ids, "t h w c -> b (t h w) c", b=bs) def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, y, guidance, attention_mask, guiding_frame_index, ref_latent, control, transformer_options, **kwargs) + + def _forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs): bs, c, t, h, w = x.shape img_ids = self.img_ids(x) txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) diff --git a/comfy/ldm/lightricks/model.py b/comfy/ldm/lightricks/model.py index ad9a7daea..aa2ea62b1 100644 --- a/comfy/ldm/lightricks/model.py +++ b/comfy/ldm/lightricks/model.py @@ -1,5 +1,6 @@ import torch from torch import nn +import comfy.patcher_extension import comfy.ldm.modules.attention import comfy.ldm.common_dit from einops import rearrange @@ -420,6 +421,13 @@ class LTXVModel(torch.nn.Module): self.patchifier = SymmetricPatchifier(1) def forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, attention_mask, frame_rate, transformer_options, keyframe_idxs, **kwargs) + + def _forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs): patches_replace = transformer_options.get("patches_replace", {}) orig_shape = list(x.shape) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index f8dc4d7db..e08ed817d 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -11,6 +11,7 @@ import comfy.ldm.common_dit from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder from comfy.ldm.modules.attention import optimized_attention_masked from comfy.ldm.flux.layers import EmbedND +import comfy.patcher_extension def modulate(x, scale): @@ -590,8 +591,15 @@ class NextDiT(nn.Module): return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis - # def forward(self, x, t, cap_feats, cap_mask): def forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, kwargs.get("transformer_options", {})) + ).execute(x, timesteps, context, num_tokens, attention_mask, **kwargs) + + # def forward(self, x, t, cap_feats, cap_mask): + def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs): t = 1.0 - timesteps cap_feats = context cap_mask = attention_mask diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index af00ff119..57a458210 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -9,6 +9,7 @@ from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps from comfy.ldm.modules.attention import optimized_attention_masked from comfy.ldm.flux.layers import EmbedND import comfy.ldm.common_dit +import comfy.patcher_extension class GELU(nn.Module): def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True, dtype=None, device=None, operations=None): @@ -355,7 +356,14 @@ class QwenImageTransformer2DModel(nn.Module): img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) - (w_len // 2) return hidden_states, repeat(img_ids, "h w c -> b (h w) c", b=bs), orig_shape - def forward( + def forward(self, x, timestep, context, attention_mask=None, guidance=None, ref_latents=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, attention_mask, guidance, ref_latents, transformer_options, **kwargs) + + def _forward( self, x, timesteps, diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 0726b8e1b..1885d9730 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -11,6 +11,7 @@ from comfy.ldm.flux.layers import EmbedND from comfy.ldm.flux.math import apply_rope import comfy.ldm.common_dit import comfy.model_management +import comfy.patcher_extension def sinusoidal_embedding_1d(dim, position): @@ -573,6 +574,13 @@ class WanModel(torch.nn.Module): return x def forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, clip_fea, time_dim_concat, transformer_options, **kwargs) + + def _forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, **kwargs): bs, c, t, h, w = x.shape x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size) diff --git a/comfy/patcher_extension.py b/comfy/patcher_extension.py index 965958f4c..46cc7b2a8 100644 --- a/comfy/patcher_extension.py +++ b/comfy/patcher_extension.py @@ -50,6 +50,7 @@ class WrappersMP: OUTER_SAMPLE = "outer_sample" PREPARE_SAMPLING = "prepare_sampling" SAMPLER_SAMPLE = "sampler_sample" + PREDICT_NOISE = "predict_noise" CALC_COND_BATCH = "calc_cond_batch" APPLY_MODEL = "apply_model" DIFFUSION_MODEL = "diffusion_model" diff --git a/comfy/samplers.py b/comfy/samplers.py index d5390d64e..ec7e0b350 100644 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -953,7 +953,14 @@ class CFGGuider: self.original_conds[k] = comfy.sampler_helpers.convert_cond(conds[k]) def __call__(self, *args, **kwargs): - return self.predict_noise(*args, **kwargs) + return self.outer_predict_noise(*args, **kwargs) + + def outer_predict_noise(self, x, timestep, model_options={}, seed=None): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self.predict_noise, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, self.model_options, is_model_options=True) + ).execute(x, timestep, model_options, seed) def predict_noise(self, x, timestep, model_options={}, seed=None): return sampling_function(self.inner_model, x, timestep, self.conds.get("negative", None), self.conds.get("positive", None), self.cfg, model_options=model_options, seed=seed) diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py new file mode 100644 index 000000000..e2b2efcd9 --- /dev/null +++ b/comfy_extras/nodes_easycache.py @@ -0,0 +1,459 @@ +from __future__ import annotations +from typing import TYPE_CHECKING, Union +from comfy_api.latest import io, ComfyExtension +import comfy.patcher_extension +import logging +import torch +import comfy.model_patcher +if TYPE_CHECKING: + from uuid import UUID + + +def easycache_forward_wrapper(executor, *args, **kwargs): + # get values from args + x: torch.Tensor = args[0] + transformer_options: dict[str] = args[-1] + if not isinstance(transformer_options, dict): + transformer_options = kwargs.get("transformer_options") + if not transformer_options: + transformer_options = args[-2] + easycache: EasyCacheHolder = transformer_options["easycache"] + sigmas = transformer_options["sigmas"] + uuids = transformer_options["uuids"] + if sigmas is not None and easycache.is_past_end_timestep(sigmas): + return executor(*args, **kwargs) + # prepare next x_prev + has_first_cond_uuid = easycache.has_first_cond_uuid(uuids) + next_x_prev = x + input_change = None + do_easycache = easycache.should_do_easycache(sigmas) + if do_easycache: + # if first cond marked this step for skipping, skip it and use appropriate cached values + if easycache.skip_current_step: + if easycache.verbose: + logging.info(f"EasyCache [verbose] - was marked to skip this step by {easycache.first_cond_uuid}. Present uuids: {uuids}") + return easycache.apply_cache_diff(x, uuids) + if easycache.initial_step: + easycache.first_cond_uuid = uuids[0] + has_first_cond_uuid = easycache.has_first_cond_uuid(uuids) + easycache.initial_step = False + if has_first_cond_uuid: + if easycache.has_x_prev_subsampled(): + input_change = (easycache.subsample(x, uuids, clone=False) - easycache.x_prev_subsampled).flatten().abs().mean() + if easycache.has_output_prev_norm() and easycache.has_relative_transformation_rate(): + approx_output_change_rate = (easycache.relative_transformation_rate * input_change) / easycache.output_prev_norm + easycache.cumulative_change_rate += approx_output_change_rate + if easycache.cumulative_change_rate < easycache.reuse_threshold: + if easycache.verbose: + logging.info(f"EasyCache [verbose] - skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}") + # other conds should also skip this step, and instead use their cached values + easycache.skip_current_step = True + return easycache.apply_cache_diff(x, uuids) + else: + if easycache.verbose: + logging.info(f"EasyCache [verbose] - NOT skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}") + easycache.cumulative_change_rate = 0.0 + + output: torch.Tensor = executor(*args, **kwargs) + if has_first_cond_uuid and easycache.has_output_prev_norm(): + output_change = (easycache.subsample(output, uuids, clone=False) - easycache.output_prev_subsampled).flatten().abs().mean() + if easycache.verbose: + output_change_rate = output_change / easycache.output_prev_norm + easycache.output_change_rates.append(output_change_rate.item()) + if easycache.has_relative_transformation_rate(): + approx_output_change_rate = (easycache.relative_transformation_rate * input_change) / easycache.output_prev_norm + easycache.approx_output_change_rates.append(approx_output_change_rate.item()) + if easycache.verbose: + logging.info(f"EasyCache [verbose] - approx_output_change_rate: {approx_output_change_rate}") + if input_change is not None: + easycache.relative_transformation_rate = output_change / input_change + if easycache.verbose: + logging.info(f"EasyCache [verbose] - output_change_rate: {output_change_rate}") + # TODO: allow cache_diff to be offloaded + easycache.update_cache_diff(output, next_x_prev, uuids) + if has_first_cond_uuid: + easycache.x_prev_subsampled = easycache.subsample(next_x_prev, uuids) + easycache.output_prev_subsampled = easycache.subsample(output, uuids) + easycache.output_prev_norm = output.flatten().abs().mean() + if easycache.verbose: + logging.info(f"EasyCache [verbose] - x_prev_subsampled: {easycache.x_prev_subsampled.shape}") + return output + +def lazycache_predict_noise_wrapper(executor, *args, **kwargs): + # get values from args + x: torch.Tensor = args[0] + timestep: float = args[1] + model_options: dict[str] = args[2] + easycache: LazyCacheHolder = model_options["transformer_options"]["easycache"] + if easycache.is_past_end_timestep(timestep): + return executor(*args, **kwargs) + # prepare next x_prev + next_x_prev = x + input_change = None + do_easycache = easycache.should_do_easycache(timestep) + if do_easycache: + if easycache.has_x_prev_subsampled(): + if easycache.has_x_prev_subsampled(): + input_change = (easycache.subsample(x, clone=False) - easycache.x_prev_subsampled).flatten().abs().mean() + if easycache.has_output_prev_norm() and easycache.has_relative_transformation_rate(): + approx_output_change_rate = (easycache.relative_transformation_rate * input_change) / easycache.output_prev_norm + easycache.cumulative_change_rate += approx_output_change_rate + if easycache.cumulative_change_rate < easycache.reuse_threshold: + if easycache.verbose: + logging.info(f"LazyCache [verbose] - skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}") + # other conds should also skip this step, and instead use their cached values + easycache.skip_current_step = True + return easycache.apply_cache_diff(x) + else: + if easycache.verbose: + logging.info(f"LazyCache [verbose] - NOT skipping step; cumulative_change_rate: {easycache.cumulative_change_rate}, reuse_threshold: {easycache.reuse_threshold}") + easycache.cumulative_change_rate = 0.0 + output: torch.Tensor = executor(*args, **kwargs) + if easycache.has_output_prev_norm(): + output_change = (easycache.subsample(output, clone=False) - easycache.output_prev_subsampled).flatten().abs().mean() + if easycache.verbose: + output_change_rate = output_change / easycache.output_prev_norm + easycache.output_change_rates.append(output_change_rate.item()) + if easycache.has_relative_transformation_rate(): + approx_output_change_rate = (easycache.relative_transformation_rate * input_change) / easycache.output_prev_norm + easycache.approx_output_change_rates.append(approx_output_change_rate.item()) + if easycache.verbose: + logging.info(f"LazyCache [verbose] - approx_output_change_rate: {approx_output_change_rate}") + if input_change is not None: + easycache.relative_transformation_rate = output_change / input_change + if easycache.verbose: + logging.info(f"LazyCache [verbose] - output_change_rate: {output_change_rate}") + # TODO: allow cache_diff to be offloaded + easycache.update_cache_diff(output, next_x_prev) + easycache.x_prev_subsampled = easycache.subsample(next_x_prev) + easycache.output_prev_subsampled = easycache.subsample(output) + easycache.output_prev_norm = output.flatten().abs().mean() + if easycache.verbose: + logging.info(f"LazyCache [verbose] - x_prev_subsampled: {easycache.x_prev_subsampled.shape}") + return output + +def easycache_calc_cond_batch_wrapper(executor, *args, **kwargs): + model_options = args[-1] + easycache: EasyCacheHolder = model_options["transformer_options"]["easycache"] + easycache.skip_current_step = False + # TODO: check if first_cond_uuid is active at this timestep; otherwise, EasyCache needs to be partially reset + return executor(*args, **kwargs) + +def easycache_sample_wrapper(executor, *args, **kwargs): + """ + This OUTER_SAMPLE wrapper makes sure easycache is prepped for current run, and all memory usage is cleared at the end. + """ + try: + guider = executor.class_obj + orig_model_options = guider.model_options + guider.model_options = comfy.model_patcher.create_model_options_clone(orig_model_options) + # clone and prepare timesteps + guider.model_options["transformer_options"]["easycache"] = guider.model_options["transformer_options"]["easycache"].clone().prepare_timesteps(guider.model_patcher.model.model_sampling) + easycache: Union[EasyCacheHolder, LazyCacheHolder] = guider.model_options['transformer_options']['easycache'] + logging.info(f"{easycache.name} enabled - threshold: {easycache.reuse_threshold}, start_percent: {easycache.start_percent}, end_percent: {easycache.end_percent}") + return executor(*args, **kwargs) + finally: + easycache = guider.model_options['transformer_options']['easycache'] + output_change_rates = easycache.output_change_rates + approx_output_change_rates = easycache.approx_output_change_rates + if easycache.verbose: + logging.info(f"{easycache.name} [verbose] - output_change_rates {len(output_change_rates)}: {output_change_rates}") + logging.info(f"{easycache.name} [verbose] - approx_output_change_rates {len(approx_output_change_rates)}: {approx_output_change_rates}") + total_steps = len(args[3])-1 + logging.info(f"{easycache.name} - skipped {easycache.total_steps_skipped}/{total_steps} steps ({total_steps/(total_steps-easycache.total_steps_skipped):.2f}x speedup).") + easycache.reset() + guider.model_options = orig_model_options + + +class EasyCacheHolder: + def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False): + self.name = "EasyCache" + self.reuse_threshold = reuse_threshold + self.start_percent = start_percent + self.end_percent = end_percent + self.subsample_factor = subsample_factor + self.offload_cache_diff = offload_cache_diff + self.verbose = verbose + # timestep values + self.start_t = 0.0 + self.end_t = 0.0 + # control values + self.relative_transformation_rate: float = None + self.cumulative_change_rate = 0.0 + self.initial_step = True + self.skip_current_step = False + # cache values + self.first_cond_uuid = None + self.x_prev_subsampled: torch.Tensor = None + self.output_prev_subsampled: torch.Tensor = None + self.output_prev_norm: torch.Tensor = None + self.uuid_cache_diffs: dict[UUID, torch.Tensor] = {} + self.output_change_rates = [] + self.approx_output_change_rates = [] + self.total_steps_skipped = 0 + # how to deal with mismatched dims + self.allow_mismatch = True + self.cut_from_start = True + + def is_past_end_timestep(self, timestep: float) -> bool: + return not (timestep[0] > self.end_t).item() + + def should_do_easycache(self, timestep: float) -> bool: + return (timestep[0] <= self.start_t).item() + + def has_x_prev_subsampled(self) -> bool: + return self.x_prev_subsampled is not None + + def has_output_prev_subsampled(self) -> bool: + return self.output_prev_subsampled is not None + + def has_output_prev_norm(self) -> bool: + return self.output_prev_norm is not None + + def has_relative_transformation_rate(self) -> bool: + return self.relative_transformation_rate is not None + + def prepare_timesteps(self, model_sampling): + self.start_t = model_sampling.percent_to_sigma(self.start_percent) + self.end_t = model_sampling.percent_to_sigma(self.end_percent) + return self + + def subsample(self, x: torch.Tensor, uuids: list[UUID], clone: bool = True) -> torch.Tensor: + batch_offset = x.shape[0] // len(uuids) + uuid_idx = uuids.index(self.first_cond_uuid) + if self.subsample_factor > 1: + to_return = x[uuid_idx*batch_offset:(uuid_idx+1)*batch_offset, ..., ::self.subsample_factor, ::self.subsample_factor] + if clone: + return to_return.clone() + return to_return + to_return = x[uuid_idx*batch_offset:(uuid_idx+1)*batch_offset, ...] + if clone: + return to_return.clone() + return to_return + + def apply_cache_diff(self, x: torch.Tensor, uuids: list[UUID]): + if self.first_cond_uuid in uuids: + self.total_steps_skipped += 1 + batch_offset = x.shape[0] // len(uuids) + for i, uuid in enumerate(uuids): + # if cached dims don't match x dims, cut off excess and hope for the best (cosmos world2video) + if x.shape[1:] != self.uuid_cache_diffs[uuid].shape[1:]: + if not self.allow_mismatch: + raise ValueError(f"Cached dims {self.uuid_cache_diffs[uuid].shape} don't match x dims {x.shape} - this is no good") + slicing = [] + skip_this_dim = True + for dim_u, dim_x in zip(self.uuid_cache_diffs[uuid].shape, x.shape): + if skip_this_dim: + skip_this_dim = False + continue + if dim_u != dim_x: + if self.cut_from_start: + slicing.append(slice(dim_x-dim_u, None)) + else: + slicing.append(slice(None, dim_u)) + else: + slicing.append(slice(None)) + slicing = [slice(i*batch_offset,(i+1)*batch_offset)] + slicing + x = x[slicing] + x += self.uuid_cache_diffs[uuid].to(x.device) + return x + + def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]): + # if output dims don't match x dims, cut off excess and hope for the best (cosmos world2video) + if output.shape[1:] != x.shape[1:]: + if not self.allow_mismatch: + raise ValueError(f"Output dims {output.shape} don't match x dims {x.shape} - this is no good") + slicing = [] + skip_dim = True + for dim_o, dim_x in zip(output.shape, x.shape): + if not skip_dim and dim_o != dim_x: + if self.cut_from_start: + slicing.append(slice(dim_x-dim_o, None)) + else: + slicing.append(slice(None, dim_o)) + else: + slicing.append(slice(None)) + skip_dim = False + x = x[slicing] + diff = output - x + batch_offset = diff.shape[0] // len(uuids) + for i, uuid in enumerate(uuids): + self.uuid_cache_diffs[uuid] = diff[i*batch_offset:(i+1)*batch_offset, ...] + + def has_first_cond_uuid(self, uuids: list[UUID]) -> bool: + return self.first_cond_uuid in uuids + + def reset(self): + self.relative_transformation_rate = 0.0 + self.cumulative_change_rate = 0.0 + self.initial_step = True + self.skip_current_step = False + self.output_change_rates = [] + self.first_cond_uuid = None + del self.x_prev_subsampled + self.x_prev_subsampled = None + del self.output_prev_subsampled + self.output_prev_subsampled = None + del self.output_prev_norm + self.output_prev_norm = None + del self.uuid_cache_diffs + self.uuid_cache_diffs = {} + self.total_steps_skipped = 0 + return self + + def clone(self): + return EasyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose) + + +class EasyCacheNode(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="EasyCache", + display_name="EasyCache", + description="Native EasyCache implementation.", + category="advanced/debug/model", + is_experimental=True, + inputs=[ + io.Model.Input("model", tooltip="The model to add EasyCache to."), + io.Float.Input("reuse_threshold", min=0.0, default=0.2, max=3.0, step=0.01, tooltip="The threshold for reusing cached steps."), + io.Float.Input("start_percent", min=0.0, default=0.15, max=1.0, step=0.01, tooltip="The relative sampling step to begin use of EasyCache."), + io.Float.Input("end_percent", min=0.0, default=0.95, max=1.0, step=0.01, tooltip="The relative sampling step to end use of EasyCache."), + io.Boolean.Input("verbose", default=False, tooltip="Whether to log verbose information."), + ], + outputs=[ + io.Model.Output(tooltip="The model with EasyCache."), + ], + ) + + @classmethod + def execute(cls, model: io.Model.Type, reuse_threshold: float, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput: + model = model.clone() + model.model_options["transformer_options"]["easycache"] = EasyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose) + model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "easycache", easycache_sample_wrapper) + model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.CALC_COND_BATCH, "easycache", easycache_calc_cond_batch_wrapper) + model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, "easycache", easycache_forward_wrapper) + return io.NodeOutput(model) + + +class LazyCacheHolder: + def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False): + self.name = "LazyCache" + self.reuse_threshold = reuse_threshold + self.start_percent = start_percent + self.end_percent = end_percent + self.subsample_factor = subsample_factor + self.offload_cache_diff = offload_cache_diff + self.verbose = verbose + # timestep values + self.start_t = 0.0 + self.end_t = 0.0 + # control values + self.relative_transformation_rate: float = None + self.cumulative_change_rate = 0.0 + self.initial_step = True + # cache values + self.x_prev_subsampled: torch.Tensor = None + self.output_prev_subsampled: torch.Tensor = None + self.output_prev_norm: torch.Tensor = None + self.cache_diff: torch.Tensor = None + self.output_change_rates = [] + self.approx_output_change_rates = [] + self.total_steps_skipped = 0 + + def has_cache_diff(self) -> bool: + return self.cache_diff is not None + + def is_past_end_timestep(self, timestep: float) -> bool: + return not (timestep[0] > self.end_t).item() + + def should_do_easycache(self, timestep: float) -> bool: + return (timestep[0] <= self.start_t).item() + + def has_x_prev_subsampled(self) -> bool: + return self.x_prev_subsampled is not None + + def has_output_prev_subsampled(self) -> bool: + return self.output_prev_subsampled is not None + + def has_output_prev_norm(self) -> bool: + return self.output_prev_norm is not None + + def has_relative_transformation_rate(self) -> bool: + return self.relative_transformation_rate is not None + + def prepare_timesteps(self, model_sampling): + self.start_t = model_sampling.percent_to_sigma(self.start_percent) + self.end_t = model_sampling.percent_to_sigma(self.end_percent) + return self + + def subsample(self, x: torch.Tensor, clone: bool = True) -> torch.Tensor: + if self.subsample_factor > 1: + to_return = x[..., ::self.subsample_factor, ::self.subsample_factor] + if clone: + return to_return.clone() + return to_return + if clone: + return x.clone() + return x + + def apply_cache_diff(self, x: torch.Tensor): + self.total_steps_skipped += 1 + return x + self.cache_diff.to(x.device) + + def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor): + self.cache_diff = output - x + + def reset(self): + self.relative_transformation_rate = 0.0 + self.cumulative_change_rate = 0.0 + self.initial_step = True + self.output_change_rates = [] + self.approx_output_change_rates = [] + del self.cache_diff + self.cache_diff = None + self.total_steps_skipped = 0 + return self + + def clone(self): + return LazyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose) + +class LazyCacheNode(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="LazyCache", + display_name="LazyCache", + description="A homebrew version of EasyCache - even 'easier' version of EasyCache to implement. Overall works worse than EasyCache, but better in some rare cases AND universal compatibility with everything in ComfyUI.", + category="advanced/debug/model", + is_experimental=True, + inputs=[ + io.Model.Input("model", tooltip="The model to add LazyCache to."), + io.Float.Input("reuse_threshold", min=0.0, default=0.2, max=3.0, step=0.01, tooltip="The threshold for reusing cached steps."), + io.Float.Input("start_percent", min=0.0, default=0.15, max=1.0, step=0.01, tooltip="The relative sampling step to begin use of LazyCache."), + io.Float.Input("end_percent", min=0.0, default=0.95, max=1.0, step=0.01, tooltip="The relative sampling step to end use of LazyCache."), + io.Boolean.Input("verbose", default=False, tooltip="Whether to log verbose information."), + ], + outputs=[ + io.Model.Output(tooltip="The model with LazyCache."), + ], + ) + + @classmethod + def execute(cls, model: io.Model.Type, reuse_threshold: float, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput: + model = model.clone() + model.model_options["transformer_options"]["easycache"] = LazyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose) + model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "lazycache", easycache_sample_wrapper) + model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, "lazycache", lazycache_predict_noise_wrapper) + return io.NodeOutput(model) + + +class EasyCacheExtension(ComfyExtension): + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + EasyCacheNode, + LazyCacheNode, + ] + +def comfy_entrypoint(): + return EasyCacheExtension() diff --git a/nodes.py b/nodes.py index 9681750d3..723ce3384 100644 --- a/nodes.py +++ b/nodes.py @@ -2322,7 +2322,8 @@ async def init_builtin_extra_nodes(): "nodes_tcfg.py", "nodes_context_windows.py", "nodes_qwen.py", - "nodes_model_patch.py" + "nodes_model_patch.py", + "nodes_easycache.py", ] import_failed = [] From 41048c69b4ccf63f876213a95a51cdde1cb0ab84 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 22 Aug 2025 20:15:44 -0700 Subject: [PATCH 183/759] Fix Conditioning masks on 3d latents. (#9506) --- comfy/samplers.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/comfy/samplers.py b/comfy/samplers.py index ec7e0b350..c7dfef4ea 100644 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -17,6 +17,7 @@ import comfy.model_patcher import comfy.patcher_extension import comfy.hooks import comfy.context_windows +import comfy.utils import scipy.stats import numpy @@ -61,7 +62,7 @@ def get_area_and_mult(conds, x_in, timestep_in): if "mask_strength" in conds: mask_strength = conds["mask_strength"] mask = conds['mask'] - assert (mask.shape[1:] == x_in.shape[2:]) + # assert (mask.shape[1:] == x_in.shape[2:]) mask = mask[:input_x.shape[0]] if area is not None: @@ -69,7 +70,7 @@ def get_area_and_mult(conds, x_in, timestep_in): mask = mask.narrow(i + 1, area[len(dims) + i], area[i]) mask = mask * mask_strength - mask = mask.unsqueeze(1).repeat(input_x.shape[0] // mask.shape[0], input_x.shape[1], 1, 1) + mask = mask.unsqueeze(1).repeat((input_x.shape[0] // mask.shape[0], input_x.shape[1]) + (1, ) * (mask.ndim - 1)) else: mask = torch.ones_like(input_x) mult = mask * strength @@ -553,7 +554,10 @@ def resolve_areas_and_cond_masks_multidim(conditions, dims, device): if len(mask.shape) == len(dims): mask = mask.unsqueeze(0) if mask.shape[1:] != dims: - mask = torch.nn.functional.interpolate(mask.unsqueeze(1), size=dims, mode='bilinear', align_corners=False).squeeze(1) + if mask.ndim < 4: + mask = comfy.utils.common_upscale(mask.unsqueeze(1), dims[-1], dims[-2], 'bilinear', 'none').squeeze(1) + else: + mask = comfy.utils.common_upscale(mask, dims[-1], dims[-2], 'bilinear', 'none') if modified.get("set_area_to_bounds", False): #TODO: handle dim != 2 bounds = torch.max(torch.abs(mask),dim=0).values.unsqueeze(0) From 59eddda90030b61f172e155bc1e2526a51a27dff Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 22 Aug 2025 22:36:44 -0700 Subject: [PATCH 184/759] Python 3.13 is well supported. (#9511) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 79a8a8c79..99a50571b 100644 --- a/README.md +++ b/README.md @@ -192,7 +192,7 @@ comfy install ## Manual Install (Windows, Linux) -python 3.13 is supported but using 3.12 is recommended because some custom nodes and their dependencies might not support it yet. +Python 3.13 is very well supported. If you have trouble with some custom node dependencies you can try 3.12 Git clone this repo. From 8be0d22ab76a3d548c9c376fd816b39d4c028c12 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 23 Aug 2025 10:56:17 -0700 Subject: [PATCH 185/759] Don't use the annoying new navigation mode by default. (#9518) --- app/app_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/app_settings.py b/app/app_settings.py index c7ac73bf6..eb69133a3 100644 --- a/app/app_settings.py +++ b/app/app_settings.py @@ -25,7 +25,7 @@ class AppSettings(): logging.error(f"The user settings file is corrupted: {file}") return {} else: - return {} + return {"Comfy.Canvas.NavigationMode": "legacy"} def save_settings(self, request, settings): file = self.user_manager.get_request_user_filepath( From 3e316c6338503a535801db3ddac9572a38a607ef Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Sat, 23 Aug 2025 14:54:01 -0700 Subject: [PATCH 186/759] Update frontend to v1.25.10 and revert navigation mode override (#9522) - Update comfyui-frontend-package from 1.25.9 to 1.25.10 - Revert forced legacy navigation mode from PR #9518 - Frontend v1.25.10 includes proper navigation mode fixes and improved display text --- app/app_settings.py | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/app_settings.py b/app/app_settings.py index eb69133a3..c7ac73bf6 100644 --- a/app/app_settings.py +++ b/app/app_settings.py @@ -25,7 +25,7 @@ class AppSettings(): logging.error(f"The user settings file is corrupted: {file}") return {} else: - return {"Comfy.Canvas.NavigationMode": "legacy"} + return {} def save_settings(self, request, settings): file = self.user_manager.get_request_user_filepath( diff --git a/requirements.txt b/requirements.txt index 6b53fabc1..131484ce8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.25.9 +comfyui-frontend-package==1.25.10 comfyui-workflow-templates==0.1.65 comfyui-embedded-docs==0.2.6 torch From 71ed4a399ec76a75aa2870b772d2022e4b9a69a3 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Sat, 23 Aug 2025 18:57:09 -0400 Subject: [PATCH 187/759] ComfyUI version 0.3.52 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 65f06cf37..834c3e8c2 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.51" +__version__ = "0.3.52" diff --git a/pyproject.toml b/pyproject.toml index ecbf04303..f6e765a81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.51" +version = "0.3.52" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 95ac7794b7c735de8e5426442507d08edd29bec5 Mon Sep 17 00:00:00 2001 From: blepping <157360029+blepping@users.noreply.github.com> Date: Sun, 24 Aug 2025 13:29:49 -0600 Subject: [PATCH 188/759] Fix EasyCache/LazyCache crash when tensor shape/dtype/device changes during sampling (#9528) * Fix EasyCache/LazyCache crash when tensor shape/dtype/device changes during sampling * Fix missing LazyCache check_metadata method Ensure LazyCache reset method resets all the tensor state values --- comfy_extras/nodes_easycache.py | 34 +++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py index e2b2efcd9..9d2988f5f 100644 --- a/comfy_extras/nodes_easycache.py +++ b/comfy_extras/nodes_easycache.py @@ -28,6 +28,7 @@ def easycache_forward_wrapper(executor, *args, **kwargs): input_change = None do_easycache = easycache.should_do_easycache(sigmas) if do_easycache: + easycache.check_metadata(x) # if first cond marked this step for skipping, skip it and use appropriate cached values if easycache.skip_current_step: if easycache.verbose: @@ -92,6 +93,7 @@ def lazycache_predict_noise_wrapper(executor, *args, **kwargs): input_change = None do_easycache = easycache.should_do_easycache(timestep) if do_easycache: + easycache.check_metadata(x) if easycache.has_x_prev_subsampled(): if easycache.has_x_prev_subsampled(): input_change = (easycache.subsample(x, clone=False) - easycache.x_prev_subsampled).flatten().abs().mean() @@ -194,6 +196,7 @@ class EasyCacheHolder: # how to deal with mismatched dims self.allow_mismatch = True self.cut_from_start = True + self.state_metadata = None def is_past_end_timestep(self, timestep: float) -> bool: return not (timestep[0] > self.end_t).item() @@ -283,6 +286,17 @@ class EasyCacheHolder: def has_first_cond_uuid(self, uuids: list[UUID]) -> bool: return self.first_cond_uuid in uuids + def check_metadata(self, x: torch.Tensor) -> bool: + metadata = (x.device, x.dtype, x.shape[1:]) + if self.state_metadata is None: + self.state_metadata = metadata + return True + if metadata == self.state_metadata: + return True + logging.warn(f"{self.name} - Tensor shape, dtype or device changed, resetting state") + self.reset() + return False + def reset(self): self.relative_transformation_rate = 0.0 self.cumulative_change_rate = 0.0 @@ -299,6 +313,7 @@ class EasyCacheHolder: del self.uuid_cache_diffs self.uuid_cache_diffs = {} self.total_steps_skipped = 0 + self.state_metadata = None return self def clone(self): @@ -360,6 +375,7 @@ class LazyCacheHolder: self.output_change_rates = [] self.approx_output_change_rates = [] self.total_steps_skipped = 0 + self.state_metadata = None def has_cache_diff(self) -> bool: return self.cache_diff is not None @@ -404,6 +420,17 @@ class LazyCacheHolder: def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor): self.cache_diff = output - x + def check_metadata(self, x: torch.Tensor) -> bool: + metadata = (x.device, x.dtype, x.shape) + if self.state_metadata is None: + self.state_metadata = metadata + return True + if metadata == self.state_metadata: + return True + logging.warn(f"{self.name} - Tensor shape, dtype or device changed, resetting state") + self.reset() + return False + def reset(self): self.relative_transformation_rate = 0.0 self.cumulative_change_rate = 0.0 @@ -412,7 +439,14 @@ class LazyCacheHolder: self.approx_output_change_rates = [] del self.cache_diff self.cache_diff = None + del self.x_prev_subsampled + self.x_prev_subsampled = None + del self.output_prev_subsampled + self.output_prev_subsampled = None + del self.output_prev_norm + self.output_prev_norm = None self.total_steps_skipped = 0 + self.state_metadata = None return self def clone(self): From f6b93d41a03081fad3c1a01221eac9c42d6790df Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 24 Aug 2025 12:40:32 -0700 Subject: [PATCH 189/759] Remove models from readme that are not fully implemented. (#9535) Cosmos model implementations are currently missing the safety part so it is technically not fully implemented and should not be advertised as such. --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 99a50571b..8024870c2 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,6 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/) - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/) - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/) - - [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/) - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/) - Image Editing Models - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) @@ -77,7 +76,6 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/) - [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/) - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/) - - [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/) and [Cosmos Predict2](https://comfyanonymous.github.io/ComfyUI_examples/cosmos_predict2/) - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/) - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/) - Audio Models From e633a47ad1b875e52758be27ec34cb8907ebe1fb Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 25 Aug 2025 17:13:54 -0700 Subject: [PATCH 190/759] Add models/audio_encoders directory. (#9548) --- folder_paths.py | 2 ++ models/audio_encoders/put_audio_encoder_models_here | 0 2 files changed, 2 insertions(+) create mode 100644 models/audio_encoders/put_audio_encoder_models_here diff --git a/folder_paths.py b/folder_paths.py index b34af39e8..f110d832b 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -48,6 +48,8 @@ folder_names_and_paths["classifiers"] = ([os.path.join(models_dir, "classifiers" folder_names_and_paths["model_patches"] = ([os.path.join(models_dir, "model_patches")], supported_pt_extensions) +folder_names_and_paths["audio_encoders"] = ([os.path.join(models_dir, "audio_encoders")], supported_pt_extensions) + output_directory = os.path.join(base_path, "output") temp_directory = os.path.join(base_path, "temp") input_directory = os.path.join(base_path, "input") diff --git a/models/audio_encoders/put_audio_encoder_models_here b/models/audio_encoders/put_audio_encoder_models_here new file mode 100644 index 000000000..e69de29bb From 914c2a29731be9c082f773c4b95892f553ac5ae8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 25 Aug 2025 20:26:47 -0700 Subject: [PATCH 191/759] Implement wav2vec2 as an audio encoder model. (#9549) This is useless on its own but there are multiple models that use it. --- comfy/audio_encoders/audio_encoders.py | 42 +++++ comfy/audio_encoders/wav2vec2.py | 207 +++++++++++++++++++++++++ comfy_api/latest/_io.py | 8 + comfy_extras/nodes_audio_encoder.py | 44 ++++++ nodes.py | 1 + 5 files changed, 302 insertions(+) create mode 100644 comfy/audio_encoders/audio_encoders.py create mode 100644 comfy/audio_encoders/wav2vec2.py create mode 100644 comfy_extras/nodes_audio_encoder.py diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py new file mode 100644 index 000000000..538c21bd5 --- /dev/null +++ b/comfy/audio_encoders/audio_encoders.py @@ -0,0 +1,42 @@ +from .wav2vec2 import Wav2Vec2Model +import comfy.model_management +import comfy.ops +import comfy.utils +import logging +import torchaudio + + +class AudioEncoderModel(): + def __init__(self, config): + self.load_device = comfy.model_management.text_encoder_device() + offload_device = comfy.model_management.text_encoder_offload_device() + self.dtype = comfy.model_management.text_encoder_dtype(self.load_device) + self.model = Wav2Vec2Model(dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast) + self.model.eval() + self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device) + self.model_sample_rate = 16000 + + def load_sd(self, sd): + return self.model.load_state_dict(sd, strict=False) + + def get_sd(self): + return self.model.state_dict() + + def encode_audio(self, audio, sample_rate): + comfy.model_management.load_model_gpu(self.patcher) + audio = torchaudio.functional.resample(audio, sample_rate, self.model_sample_rate) + out, all_layers = self.model(audio.to(self.load_device)) + outputs = {} + outputs["encoded_audio"] = out + outputs["encoded_audio_all_layers"] = all_layers + return outputs + + +def load_audio_encoder_from_sd(sd, prefix=""): + audio_encoder = AudioEncoderModel(None) + sd = comfy.utils.state_dict_prefix_replace(sd, {"wav2vec2.": ""}) + m, u = audio_encoder.load_sd(sd) + if len(m) > 0: + logging.warning("missing audio encoder: {}".format(m)) + + return audio_encoder diff --git a/comfy/audio_encoders/wav2vec2.py b/comfy/audio_encoders/wav2vec2.py new file mode 100644 index 000000000..de906622a --- /dev/null +++ b/comfy/audio_encoders/wav2vec2.py @@ -0,0 +1,207 @@ +import torch +import torch.nn as nn +from comfy.ldm.modules.attention import optimized_attention_masked + + +class LayerNormConv(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride, bias=False, dtype=None, device=None, operations=None): + super().__init__() + self.conv = operations.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=bias, device=device, dtype=dtype) + self.layer_norm = operations.LayerNorm(out_channels, elementwise_affine=True, device=device, dtype=dtype) + + def forward(self, x): + x = self.conv(x) + return torch.nn.functional.gelu(self.layer_norm(x.transpose(-2, -1)).transpose(-2, -1)) + + +class ConvFeatureEncoder(nn.Module): + def __init__(self, conv_dim, dtype=None, device=None, operations=None): + super().__init__() + self.conv_layers = nn.ModuleList([ + LayerNormConv(1, conv_dim, kernel_size=10, stride=5, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations), + ]) + + def forward(self, x): + x = x.unsqueeze(1) + + for conv in self.conv_layers: + x = conv(x) + + return x.transpose(1, 2) + + +class FeatureProjection(nn.Module): + def __init__(self, conv_dim, embed_dim, dtype=None, device=None, operations=None): + super().__init__() + self.layer_norm = operations.LayerNorm(conv_dim, eps=1e-05, device=device, dtype=dtype) + self.projection = operations.Linear(conv_dim, embed_dim, device=device, dtype=dtype) + + def forward(self, x): + x = self.layer_norm(x) + x = self.projection(x) + return x + + +class PositionalConvEmbedding(nn.Module): + def __init__(self, embed_dim=768, kernel_size=128, groups=16): + super().__init__() + self.conv = nn.Conv1d( + embed_dim, + embed_dim, + kernel_size=kernel_size, + padding=kernel_size // 2, + groups=groups, + ) + self.conv = torch.nn.utils.parametrizations.weight_norm(self.conv, name="weight", dim=2) + self.activation = nn.GELU() + + def forward(self, x): + x = x.transpose(1, 2) + x = self.conv(x)[:, :, :-1] + x = self.activation(x) + x = x.transpose(1, 2) + return x + + +class TransformerEncoder(nn.Module): + def __init__( + self, + embed_dim=768, + num_heads=12, + num_layers=12, + mlp_ratio=4.0, + dtype=None, device=None, operations=None + ): + super().__init__() + + self.pos_conv_embed = PositionalConvEmbedding(embed_dim=embed_dim) + self.layers = nn.ModuleList([ + TransformerEncoderLayer( + embed_dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + device=device, dtype=dtype, operations=operations + ) + for _ in range(num_layers) + ]) + + self.layer_norm = operations.LayerNorm(embed_dim, eps=1e-05, device=device, dtype=dtype) + + def forward(self, x, mask=None): + x = x + self.pos_conv_embed(x) + all_x = () + for layer in self.layers: + all_x += (x,) + x = layer(x, mask) + x = self.layer_norm(x) + all_x += (x,) + return x, all_x + + +class Attention(nn.Module): + def __init__(self, embed_dim, num_heads, bias=True, dtype=None, device=None, operations=None): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + + self.k_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype) + self.v_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype) + self.q_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype) + self.out_proj = operations.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype) + + def forward(self, x, mask=None): + assert (mask is None) # TODO? + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + + out = optimized_attention_masked(q, k, v, self.num_heads) + return self.out_proj(out) + + +class FeedForward(nn.Module): + def __init__(self, embed_dim, mlp_ratio, dtype=None, device=None, operations=None): + super().__init__() + self.intermediate_dense = operations.Linear(embed_dim, int(embed_dim * mlp_ratio), device=device, dtype=dtype) + self.output_dense = operations.Linear(int(embed_dim * mlp_ratio), embed_dim, device=device, dtype=dtype) + + def forward(self, x): + x = self.intermediate_dense(x) + x = torch.nn.functional.gelu(x) + x = self.output_dense(x) + return x + + +class TransformerEncoderLayer(nn.Module): + def __init__( + self, + embed_dim=768, + num_heads=12, + mlp_ratio=4.0, + dtype=None, device=None, operations=None + ): + super().__init__() + + self.attention = Attention(embed_dim, num_heads, device=device, dtype=dtype, operations=operations) + + self.layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype) + self.feed_forward = FeedForward(embed_dim, mlp_ratio, device=device, dtype=dtype, operations=operations) + self.final_layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype) + + def forward(self, x, mask=None): + residual = x + x = self.layer_norm(x) + x = self.attention(x, mask=mask) + x = residual + x + + x = x + self.feed_forward(self.final_layer_norm(x)) + return x + + +class Wav2Vec2Model(nn.Module): + """Complete Wav2Vec 2.0 model.""" + + def __init__( + self, + embed_dim=1024, + final_dim=256, + num_heads=16, + num_layers=24, + dtype=None, device=None, operations=None + ): + super().__init__() + + conv_dim = 512 + self.feature_extractor = ConvFeatureEncoder(conv_dim, device=device, dtype=dtype, operations=operations) + self.feature_projection = FeatureProjection(conv_dim, embed_dim, device=device, dtype=dtype, operations=operations) + + self.masked_spec_embed = nn.Parameter(torch.empty(embed_dim, device=device, dtype=dtype)) + + self.encoder = TransformerEncoder( + embed_dim=embed_dim, + num_heads=num_heads, + num_layers=num_layers, + device=device, dtype=dtype, operations=operations + ) + + def forward(self, x, mask_time_indices=None, return_dict=False): + + x = torch.mean(x, dim=1) + + x = (x - x.mean()) / torch.sqrt(x.var() + 1e-7) + + features = self.feature_extractor(x) + features = self.feature_projection(features) + + batch_size, seq_len, _ = features.shape + + x, all_x = self.encoder(features) + + return x, all_x diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index a3a21facc..5cb474459 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -730,6 +730,14 @@ class AnyType(ComfyTypeIO): class MODEL_PATCH(ComfyTypeIO): Type = Any +@comfytype(io_type="AUDIO_ENCODER") +class AUDIO_ENCODER(ComfyTypeIO): + Type = Any + +@comfytype(io_type="AUDIO_ENCODER_OUTPUT") +class AUDIO_ENCODER_OUTPUT(ComfyTypeIO): + Type = Any + @comfytype(io_type="COMFY_MULTITYPED_V3") class MultiType: Type = Any diff --git a/comfy_extras/nodes_audio_encoder.py b/comfy_extras/nodes_audio_encoder.py new file mode 100644 index 000000000..39a140fef --- /dev/null +++ b/comfy_extras/nodes_audio_encoder.py @@ -0,0 +1,44 @@ +import folder_paths +import comfy.audio_encoders.audio_encoders +import comfy.utils + + +class AudioEncoderLoader: + @classmethod + def INPUT_TYPES(s): + return {"required": { "audio_encoder_name": (folder_paths.get_filename_list("audio_encoders"), ), + }} + RETURN_TYPES = ("AUDIO_ENCODER",) + FUNCTION = "load_model" + + CATEGORY = "loaders" + + def load_model(self, audio_encoder_name): + audio_encoder_name = folder_paths.get_full_path_or_raise("audio_encoders", audio_encoder_name) + sd = comfy.utils.load_torch_file(audio_encoder_name, safe_load=True) + audio_encoder = comfy.audio_encoders.audio_encoders.load_audio_encoder_from_sd(sd) + if audio_encoder is None: + raise RuntimeError("ERROR: audio encoder file is invalid and does not contain a valid model.") + return (audio_encoder,) + + +class AudioEncoderEncode: + @classmethod + def INPUT_TYPES(s): + return {"required": { "audio_encoder": ("AUDIO_ENCODER",), + "audio": ("AUDIO",), + }} + RETURN_TYPES = ("AUDIO_ENCODER_OUTPUT",) + FUNCTION = "encode" + + CATEGORY = "conditioning" + + def encode(self, audio_encoder, audio): + output = audio_encoder.encode_audio(audio["waveform"], audio["sample_rate"]) + return (output,) + + +NODE_CLASS_MAPPINGS = { + "AudioEncoderLoader": AudioEncoderLoader, + "AudioEncoderEncode": AudioEncoderEncode, +} diff --git a/nodes.py b/nodes.py index 723ce3384..0aff6b14a 100644 --- a/nodes.py +++ b/nodes.py @@ -2324,6 +2324,7 @@ async def init_builtin_extra_nodes(): "nodes_qwen.py", "nodes_model_patch.py", "nodes_easycache.py", + "nodes_audio_encoder.py", ] import_failed = [] From 39aa06bd5d630e50c88d3be1586d21737c4387c1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 26 Aug 2025 09:50:46 -0700 Subject: [PATCH 192/759] Make AudioEncoderOutput usable in v3 node schema. (#9554) --- comfy_api/latest/_io.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 5cb474459..e0ee943a7 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -731,11 +731,11 @@ class MODEL_PATCH(ComfyTypeIO): Type = Any @comfytype(io_type="AUDIO_ENCODER") -class AUDIO_ENCODER(ComfyTypeIO): +class AudioEncoder(ComfyTypeIO): Type = Any @comfytype(io_type="AUDIO_ENCODER_OUTPUT") -class AUDIO_ENCODER_OUTPUT(ComfyTypeIO): +class AudioEncoderOutput(ComfyTypeIO): Type = Any @comfytype(io_type="COMFY_MULTITYPED_V3") @@ -1592,6 +1592,7 @@ class _IO: Model = Model ClipVision = ClipVision ClipVisionOutput = ClipVisionOutput + AudioEncoderOutput = AudioEncoderOutput StyleModel = StyleModel Gligen = Gligen UpscaleModel = UpscaleModel From 5352abc6d389570455776c457738db54367cd6cb Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 27 Aug 2025 01:33:54 +0800 Subject: [PATCH 193/759] Update template to 0.1.66 (#9557) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 131484ce8..db59bb38c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.10 -comfyui-workflow-templates==0.1.65 +comfyui-workflow-templates==0.1.66 comfyui-embedded-docs==0.2.6 torch torchsde From 47f4db3e84874ca6076e5cdbb345444faec83028 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Tue, 26 Aug 2025 19:20:44 -0700 Subject: [PATCH 194/759] Adding Google Gemini Image API node (#9566) * bigcat88's progress on adding Google Gemini Image node * Made Google Gemini Image node functional * Bump frontend version to get static pricing badge on Gemini Image node --- comfy_api_nodes/apis/gemini_api.py | 19 ++ comfy_api_nodes/nodes_gemini.py | 388 ++++++++++++++++++++++------- requirements.txt | 2 +- 3 files changed, 314 insertions(+), 95 deletions(-) create mode 100644 comfy_api_nodes/apis/gemini_api.py diff --git a/comfy_api_nodes/apis/gemini_api.py b/comfy_api_nodes/apis/gemini_api.py new file mode 100644 index 000000000..138bf035d --- /dev/null +++ b/comfy_api_nodes/apis/gemini_api.py @@ -0,0 +1,19 @@ +from __future__ import annotations + +from typing import List, Optional + +from comfy_api_nodes.apis import GeminiGenerationConfig, GeminiContent, GeminiSafetySetting, GeminiSystemInstructionContent, GeminiTool, GeminiVideoMetadata +from pydantic import BaseModel + + +class GeminiImageGenerationConfig(GeminiGenerationConfig): + responseModalities: Optional[List[str]] = None + + +class GeminiImageGenerateContentRequest(BaseModel): + contents: List[GeminiContent] + generationConfig: Optional[GeminiImageGenerationConfig] = None + safetySettings: Optional[List[GeminiSafetySetting]] = None + systemInstruction: Optional[GeminiSystemInstructionContent] = None + tools: Optional[List[GeminiTool]] = None + videoMetadata: Optional[GeminiVideoMetadata] = None diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 78c402a7a..baa379b75 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -4,11 +4,12 @@ See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/infer """ from __future__ import annotations - import json import time import os import uuid +import base64 +from io import BytesIO from enum import Enum from typing import Optional, Literal @@ -25,6 +26,7 @@ from comfy_api_nodes.apis import ( GeminiPart, GeminiMimeType, ) +from comfy_api_nodes.apis.gemini_api import GeminiImageGenerationConfig, GeminiImageGenerateContentRequest from comfy_api_nodes.apis.client import ( ApiEndpoint, HttpMethod, @@ -35,6 +37,7 @@ from comfy_api_nodes.apinode_utils import ( audio_to_base64_string, video_to_base64_string, tensor_to_base64_string, + bytesio_to_image_tensor, ) @@ -53,6 +56,14 @@ class GeminiModel(str, Enum): gemini_2_5_flash = "gemini-2.5-flash" +class GeminiImageModel(str, Enum): + """ + Gemini Image Model Names allowed by comfy-api + """ + + gemini_2_5_flash_image_preview = "gemini-2.5-flash-image-preview" + + def get_gemini_endpoint( model: GeminiModel, ) -> ApiEndpoint[GeminiGenerateContentRequest, GeminiGenerateContentResponse]: @@ -75,6 +86,135 @@ def get_gemini_endpoint( ) +def get_gemini_image_endpoint( + model: GeminiImageModel, +) -> ApiEndpoint[GeminiGenerateContentRequest, GeminiGenerateContentResponse]: + """ + Get the API endpoint for a given Gemini model. + + Args: + model: The Gemini model to use, either as enum or string value. + + Returns: + ApiEndpoint configured for the specific Gemini model. + """ + if isinstance(model, str): + model = GeminiImageModel(model) + return ApiEndpoint( + path=f"{GEMINI_BASE_ENDPOINT}/{model.value}", + method=HttpMethod.POST, + request_model=GeminiImageGenerateContentRequest, + response_model=GeminiGenerateContentResponse, + ) + + +def create_image_parts(image_input: torch.Tensor) -> list[GeminiPart]: + """ + Convert image tensor input to Gemini API compatible parts. + + Args: + image_input: Batch of image tensors from ComfyUI. + + Returns: + List of GeminiPart objects containing the encoded images. + """ + image_parts: list[GeminiPart] = [] + for image_index in range(image_input.shape[0]): + image_as_b64 = tensor_to_base64_string( + image_input[image_index].unsqueeze(0) + ) + image_parts.append( + GeminiPart( + inlineData=GeminiInlineData( + mimeType=GeminiMimeType.image_png, + data=image_as_b64, + ) + ) + ) + return image_parts + + +def create_text_part(text: str) -> GeminiPart: + """ + Create a text part for the Gemini API request. + + Args: + text: The text content to include in the request. + + Returns: + A GeminiPart object with the text content. + """ + return GeminiPart(text=text) + + +def get_parts_from_response( + response: GeminiGenerateContentResponse +) -> list[GeminiPart]: + """ + Extract all parts from the Gemini API response. + + Args: + response: The API response from Gemini. + + Returns: + List of response parts from the first candidate. + """ + return response.candidates[0].content.parts + + +def get_parts_by_type( + response: GeminiGenerateContentResponse, part_type: Literal["text"] | str +) -> list[GeminiPart]: + """ + Filter response parts by their type. + + Args: + response: The API response from Gemini. + part_type: Type of parts to extract ("text" or a MIME type). + + Returns: + List of response parts matching the requested type. + """ + parts = [] + for part in get_parts_from_response(response): + if part_type == "text" and hasattr(part, "text") and part.text: + parts.append(part) + elif ( + hasattr(part, "inlineData") + and part.inlineData + and part.inlineData.mimeType == part_type + ): + parts.append(part) + # Skip parts that don't match the requested type + return parts + + +def get_text_from_response(response: GeminiGenerateContentResponse) -> str: + """ + Extract and concatenate all text parts from the response. + + Args: + response: The API response from Gemini. + + Returns: + Combined text from all text parts in the response. + """ + parts = get_parts_by_type(response, "text") + return "\n".join([part.text for part in parts]) + + +def get_image_from_response(response: GeminiGenerateContentResponse) -> torch.Tensor: + image_tensors: list[torch.Tensor] = [] + parts = get_parts_by_type(response, "image/png") + for part in parts: + image_data = base64.b64decode(part.inlineData.data) + returned_image = bytesio_to_image_tensor(BytesIO(image_data)) + image_tensors.append(returned_image) + if len(image_tensors) == 0: + return torch.zeros((1,1024,1024,4)) + return torch.cat(image_tensors, dim=0) + + class GeminiNode(ComfyNodeABC): """ Node to generate text responses from a Gemini model. @@ -159,59 +299,6 @@ class GeminiNode(ComfyNodeABC): CATEGORY = "api node/text/Gemini" API_NODE = True - def get_parts_from_response( - self, response: GeminiGenerateContentResponse - ) -> list[GeminiPart]: - """ - Extract all parts from the Gemini API response. - - Args: - response: The API response from Gemini. - - Returns: - List of response parts from the first candidate. - """ - return response.candidates[0].content.parts - - def get_parts_by_type( - self, response: GeminiGenerateContentResponse, part_type: Literal["text"] | str - ) -> list[GeminiPart]: - """ - Filter response parts by their type. - - Args: - response: The API response from Gemini. - part_type: Type of parts to extract ("text" or a MIME type). - - Returns: - List of response parts matching the requested type. - """ - parts = [] - for part in self.get_parts_from_response(response): - if part_type == "text" and hasattr(part, "text") and part.text: - parts.append(part) - elif ( - hasattr(part, "inlineData") - and part.inlineData - and part.inlineData.mimeType == part_type - ): - parts.append(part) - # Skip parts that don't match the requested type - return parts - - def get_text_from_response(self, response: GeminiGenerateContentResponse) -> str: - """ - Extract and concatenate all text parts from the response. - - Args: - response: The API response from Gemini. - - Returns: - Combined text from all text parts in the response. - """ - parts = self.get_parts_by_type(response, "text") - return "\n".join([part.text for part in parts]) - def create_video_parts(self, video_input: IO.VIDEO, **kwargs) -> list[GeminiPart]: """ Convert video input to Gemini API compatible parts. @@ -271,43 +358,6 @@ class GeminiNode(ComfyNodeABC): ) return audio_parts - def create_image_parts(self, image_input: torch.Tensor) -> list[GeminiPart]: - """ - Convert image tensor input to Gemini API compatible parts. - - Args: - image_input: Batch of image tensors from ComfyUI. - - Returns: - List of GeminiPart objects containing the encoded images. - """ - image_parts: list[GeminiPart] = [] - for image_index in range(image_input.shape[0]): - image_as_b64 = tensor_to_base64_string( - image_input[image_index].unsqueeze(0) - ) - image_parts.append( - GeminiPart( - inlineData=GeminiInlineData( - mimeType=GeminiMimeType.image_png, - data=image_as_b64, - ) - ) - ) - return image_parts - - def create_text_part(self, text: str) -> GeminiPart: - """ - Create a text part for the Gemini API request. - - Args: - text: The text content to include in the request. - - Returns: - A GeminiPart object with the text content. - """ - return GeminiPart(text=text) - async def api_call( self, prompt: str, @@ -323,11 +373,11 @@ class GeminiNode(ComfyNodeABC): validate_string(prompt, strip_whitespace=False) # Create parts list with text prompt as the first part - parts: list[GeminiPart] = [self.create_text_part(prompt)] + parts: list[GeminiPart] = [create_text_part(prompt)] # Add other modal parts if images is not None: - image_parts = self.create_image_parts(images) + image_parts = create_image_parts(images) parts.extend(image_parts) if audio is not None: parts.extend(self.create_audio_parts(audio)) @@ -351,7 +401,7 @@ class GeminiNode(ComfyNodeABC): ).execute() # Get result output - output_text = self.get_text_from_response(response) + output_text = get_text_from_response(response) if unique_id and output_text: # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. render_spec = { @@ -462,12 +512,162 @@ class GeminiInputFiles(ComfyNodeABC): return (files,) +class GeminiImage(ComfyNodeABC): + """ + Node to generate text and image responses from a Gemini model. + + This node allows users to interact with Google's Gemini AI models, providing + multimodal inputs (text, images, files) to generate coherent + text and image responses. The node works with the latest Gemini models, handling the + API communication and response parsing. + """ + @classmethod + def INPUT_TYPES(cls) -> InputTypeDict: + return { + "required": { + "prompt": ( + IO.STRING, + { + "multiline": True, + "default": "", + "tooltip": "Text prompt for generation", + }, + ), + "model": ( + IO.COMBO, + { + "tooltip": "The Gemini model to use for generating responses.", + "options": [model.value for model in GeminiImageModel], + "default": GeminiImageModel.gemini_2_5_flash_image_preview.value, + }, + ), + "seed": ( + IO.INT, + { + "default": 42, + "min": 0, + "max": 0xFFFFFFFFFFFFFFFF, + "control_after_generate": True, + "tooltip": "When seed is fixed to a specific value, the model makes a best effort to provide the same response for repeated requests. Deterministic output isn't guaranteed. Also, changing the model or parameter settings, such as the temperature, can cause variations in the response even when you use the same seed value. By default, a random seed value is used.", + }, + ), + }, + "optional": { + "images": ( + IO.IMAGE, + { + "default": None, + "tooltip": "Optional image(s) to use as context for the model. To include multiple images, you can use the Batch Images node.", + }, + ), + "files": ( + "GEMINI_INPUT_FILES", + { + "default": None, + "tooltip": "Optional file(s) to use as context for the model. Accepts inputs from the Gemini Generate Content Input Files node.", + }, + ), + # TODO: later we can add this parameter later + # "n": ( + # IO.INT, + # { + # "default": 1, + # "min": 1, + # "max": 8, + # "step": 1, + # "display": "number", + # "tooltip": "How many images to generate", + # }, + # ), + }, + "hidden": { + "auth_token": "AUTH_TOKEN_COMFY_ORG", + "comfy_api_key": "API_KEY_COMFY_ORG", + "unique_id": "UNIQUE_ID", + }, + } + + RETURN_TYPES = (IO.IMAGE, IO.STRING) + FUNCTION = "api_call" + CATEGORY = "api node/image/Gemini" + DESCRIPTION = "Edit images synchronously via Google API." + API_NODE = True + + async def api_call( + self, + prompt: str, + model: GeminiImageModel, + images: Optional[IO.IMAGE] = None, + files: Optional[list[GeminiPart]] = None, + n=1, + unique_id: Optional[str] = None, + **kwargs, + ): + # Validate inputs + validate_string(prompt, strip_whitespace=True, min_length=1) + # Create parts list with text prompt as the first part + parts: list[GeminiPart] = [create_text_part(prompt)] + + # Add other modal parts + if images is not None: + image_parts = create_image_parts(images) + parts.extend(image_parts) + if files is not None: + parts.extend(files) + + response = await SynchronousOperation( + endpoint=get_gemini_image_endpoint(model), + request=GeminiImageGenerateContentRequest( + contents=[ + GeminiContent( + role="user", + parts=parts, + ), + ], + generationConfig=GeminiImageGenerationConfig( + responseModalities=["TEXT","IMAGE"] + ) + ), + auth_kwargs=kwargs, + ).execute() + + output_image = get_image_from_response(response) + output_text = get_text_from_response(response) + if unique_id and output_text: + # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. + render_spec = { + "node_id": unique_id, + "component": "ChatHistoryWidget", + "props": { + "history": json.dumps( + [ + { + "prompt": prompt, + "response": output_text, + "response_id": str(uuid.uuid4()), + "timestamp": time.time(), + } + ] + ), + }, + } + PromptServer.instance.send_sync( + "display_component", + render_spec, + ) + + output_text = output_text or "Empty response from Gemini model..." + return (output_image, output_text,) + + NODE_CLASS_MAPPINGS = { "GeminiNode": GeminiNode, + "GeminiImageNode": GeminiImage, "GeminiInputFiles": GeminiInputFiles, } NODE_DISPLAY_NAME_MAPPINGS = { "GeminiNode": "Google Gemini", + "GeminiImageNode": "Google Gemini Image", "GeminiInputFiles": "Gemini Input Files", } diff --git a/requirements.txt b/requirements.txt index db59bb38c..174f3d4d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.25.10 +comfyui-frontend-package==1.25.11 comfyui-workflow-templates==0.1.66 comfyui-embedded-docs==0.2.6 torch From 6a193ac557b2b35a6d2ea1916b0b8d5d9ee9b1ba Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 27 Aug 2025 12:10:20 +0800 Subject: [PATCH 195/759] Update template to 0.1.68 (#9569) * Update template to 0.1.67 * Update template to 0.1.68 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 174f3d4d1..93d88859d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.11 -comfyui-workflow-templates==0.1.66 +comfyui-workflow-templates==0.1.68 comfyui-embedded-docs==0.2.6 torch torchsde From 88aee596a30e9b80ca831c42a0ae70e0d22b61ae Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 26 Aug 2025 22:10:34 -0700 Subject: [PATCH 196/759] WIP Wan 2.2 S2V model. (#9568) --- comfy/ldm/wan/model.py | 508 ++++++++++++++++++++++++++++++++++++-- comfy/model_base.py | 23 ++ comfy/model_detection.py | 2 + comfy/supported_models.py | 15 +- comfy_extras/nodes_wan.py | 175 +++++++++++++ 5 files changed, 707 insertions(+), 16 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 1885d9730..dedfb47e2 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -4,7 +4,7 @@ import math import torch import torch.nn as nn -from einops import repeat +from einops import rearrange from comfy.ldm.modules.attention import optimized_attention from comfy.ldm.flux.layers import EmbedND @@ -153,7 +153,10 @@ def repeat_e(e, x): repeats = x.size(1) // e.size(1) if repeats == 1: return e - return torch.repeat_interleave(e, repeats, dim=1) + if repeats * e.size(1) == x.size(1): + return torch.repeat_interleave(e, repeats, dim=1) + else: + return torch.repeat_interleave(e, repeats + 1, dim=1)[:, :x.size(1)] class WanAttentionBlock(nn.Module): @@ -573,6 +576,28 @@ class WanModel(torch.nn.Module): x = self.unpatchify(x, grid_sizes) return x + def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None): + patch_size = self.patch_size + t_len = ((t + (patch_size[0] // 2)) // patch_size[0]) + h_len = ((h + (patch_size[1] // 2)) // patch_size[1]) + w_len = ((w + (patch_size[2] // 2)) // patch_size[2]) + + if steps_t is None: + steps_t = t_len + if steps_h is None: + steps_h = h_len + if steps_w is None: + steps_w = w_len + + img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype) + img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1) + img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1) + img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1) + img_ids = img_ids.reshape(1, -1, img_ids.shape[-1]) + + freqs = self.rope_embedder(img_ids).movedim(1, 2) + return freqs + def forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, **kwargs): return comfy.patcher_extension.WrapperExecutor.new_class_executor( self._forward, @@ -584,26 +609,16 @@ class WanModel(torch.nn.Module): bs, c, t, h, w = x.shape x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size) - patch_size = self.patch_size - t_len = ((t + (patch_size[0] // 2)) // patch_size[0]) - h_len = ((h + (patch_size[1] // 2)) // patch_size[1]) - w_len = ((w + (patch_size[2] // 2)) // patch_size[2]) - + t_len = t if time_dim_concat is not None: time_dim_concat = comfy.ldm.common_dit.pad_to_patch_size(time_dim_concat, self.patch_size) x = torch.cat([x, time_dim_concat], dim=2) - t_len = ((x.shape[2] + (patch_size[0] // 2)) // patch_size[0]) + t_len = x.shape[2] if self.ref_conv is not None and "reference_latent" in kwargs: t_len += 1 - img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype) - img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1) - img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1) - img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1) - img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs) - - freqs = self.rope_embedder(img_ids).movedim(1, 2) + freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype) return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options, **kwargs)[:, :, :t, :h, :w] def unpatchify(self, x, grid_sizes): @@ -839,3 +854,466 @@ class CameraWanModel(WanModel): # unpatchify x = self.unpatchify(x, grid_sizes) return x + + +class CausalConv1d(nn.Module): + + def __init__(self, + chan_in, + chan_out, + kernel_size=3, + stride=1, + dilation=1, + pad_mode='replicate', + operations=None, + **kwargs): + super().__init__() + + self.pad_mode = pad_mode + padding = (kernel_size - 1, 0) # T + self.time_causal_padding = padding + + self.conv = operations.Conv1d( + chan_in, + chan_out, + kernel_size, + stride=stride, + dilation=dilation, + **kwargs) + + def forward(self, x): + x = torch.nn.functional.pad(x, self.time_causal_padding, mode=self.pad_mode) + return self.conv(x) + + +class MotionEncoder_tc(nn.Module): + + def __init__(self, + in_dim: int, + hidden_dim: int, + num_heads=int, + need_global=True, + dtype=None, + device=None, + operations=None,): + factory_kwargs = {"dtype": dtype, "device": device} + super().__init__() + + self.num_heads = num_heads + self.need_global = need_global + self.conv1_local = CausalConv1d(in_dim, hidden_dim // 4 * num_heads, 3, stride=1, operations=operations, **factory_kwargs) + if need_global: + self.conv1_global = CausalConv1d( + in_dim, hidden_dim // 4, 3, stride=1, operations=operations, **factory_kwargs) + self.norm1 = operations.LayerNorm( + hidden_dim // 4, + elementwise_affine=False, + eps=1e-6, + **factory_kwargs) + self.act = nn.SiLU() + self.conv2 = CausalConv1d(hidden_dim // 4, hidden_dim // 2, 3, stride=2, operations=operations, **factory_kwargs) + self.conv3 = CausalConv1d(hidden_dim // 2, hidden_dim, 3, stride=2, operations=operations, **factory_kwargs) + + if need_global: + self.final_linear = operations.Linear(hidden_dim, hidden_dim, **factory_kwargs) + + self.norm1 = operations.LayerNorm( + hidden_dim // 4, + elementwise_affine=False, + eps=1e-6, + **factory_kwargs) + + self.norm2 = operations.LayerNorm( + hidden_dim // 2, + elementwise_affine=False, + eps=1e-6, + **factory_kwargs) + + self.norm3 = operations.LayerNorm( + hidden_dim, elementwise_affine=False, eps=1e-6, **factory_kwargs) + + self.padding_tokens = nn.Parameter(torch.empty(1, 1, 1, hidden_dim, **factory_kwargs)) + + def forward(self, x): + x = rearrange(x, 'b t c -> b c t') + x_ori = x.clone() + b, c, t = x.shape + x = self.conv1_local(x) + x = rearrange(x, 'b (n c) t -> (b n) t c', n=self.num_heads) + x = self.norm1(x) + x = self.act(x) + x = rearrange(x, 'b t c -> b c t') + x = self.conv2(x) + x = rearrange(x, 'b c t -> b t c') + x = self.norm2(x) + x = self.act(x) + x = rearrange(x, 'b t c -> b c t') + x = self.conv3(x) + x = rearrange(x, 'b c t -> b t c') + x = self.norm3(x) + x = self.act(x) + x = rearrange(x, '(b n) t c -> b t n c', b=b) + padding = comfy.model_management.cast_to(self.padding_tokens, dtype=x.dtype, device=x.device).repeat(b, x.shape[1], 1, 1) + x = torch.cat([x, padding], dim=-2) + x_local = x.clone() + + if not self.need_global: + return x_local + + x = self.conv1_global(x_ori) + x = rearrange(x, 'b c t -> b t c') + x = self.norm1(x) + x = self.act(x) + x = rearrange(x, 'b t c -> b c t') + x = self.conv2(x) + x = rearrange(x, 'b c t -> b t c') + x = self.norm2(x) + x = self.act(x) + x = rearrange(x, 'b t c -> b c t') + x = self.conv3(x) + x = rearrange(x, 'b c t -> b t c') + x = self.norm3(x) + x = self.act(x) + x = self.final_linear(x) + x = rearrange(x, '(b n) t c -> b t n c', b=b) + + return x, x_local + + +class CausalAudioEncoder(nn.Module): + + def __init__(self, + dim=5120, + num_layers=25, + out_dim=2048, + video_rate=8, + num_token=4, + need_global=False, + dtype=None, + device=None, + operations=None): + super().__init__() + self.encoder = MotionEncoder_tc( + in_dim=dim, + hidden_dim=out_dim, + num_heads=num_token, + need_global=need_global, dtype=dtype, device=device, operations=operations) + weight = torch.empty((1, num_layers, 1, 1), dtype=dtype, device=device) + + self.weights = torch.nn.Parameter(weight) + self.act = torch.nn.SiLU() + + def forward(self, features): + # features B * num_layers * dim * video_length + weights = self.act(comfy.model_management.cast_to(self.weights, dtype=features.dtype, device=features.device)) + weights_sum = weights.sum(dim=1, keepdims=True) + weighted_feat = ((features * weights) / weights_sum).sum( + dim=1) # b dim f + weighted_feat = weighted_feat.permute(0, 2, 1) # b f dim + res = self.encoder(weighted_feat) # b f n dim + return res # b f n dim + + +class AdaLayerNorm(nn.Module): + def __init__(self, embedding_dim, output_dim=None, norm_elementwise_affine=False, norm_eps=1e-5, dtype=None, device=None, operations=None): + super().__init__() + + output_dim = output_dim or embedding_dim * 2 + + self.silu = nn.SiLU() + self.linear = operations.Linear(embedding_dim, output_dim, dtype=dtype, device=device) + self.norm = operations.LayerNorm(output_dim // 2, norm_eps, norm_elementwise_affine, dtype=dtype, device=device) + + def forward(self, x, temb): + temb = self.linear(self.silu(temb)) + shift, scale = temb.chunk(2, dim=1) + shift = shift[:, None, :] + scale = scale[:, None, :] + x = self.norm(x) * (1 + scale) + shift + return x + + +class AudioInjector_WAN(nn.Module): + + def __init__(self, + dim=2048, + num_heads=32, + inject_layer=[0, 27], + root_net=None, + enable_adain=False, + adain_dim=2048, + adain_mode=None, + dtype=None, + device=None, + operations=None): + super().__init__() + self.enable_adain = enable_adain + self.adain_mode = adain_mode + self.injected_block_id = {} + audio_injector_id = 0 + for inject_id in inject_layer: + self.injected_block_id[inject_id] = audio_injector_id + audio_injector_id += 1 + + self.injector = nn.ModuleList([ + WanT2VCrossAttention( + dim=dim, + num_heads=num_heads, + qk_norm=True, operation_settings={"operations": operations, "device": device, "dtype": dtype} + ) for _ in range(audio_injector_id) + ]) + self.injector_pre_norm_feat = nn.ModuleList([ + operations.LayerNorm( + dim, + elementwise_affine=False, + eps=1e-6, dtype=dtype, device=device + ) for _ in range(audio_injector_id) + ]) + self.injector_pre_norm_vec = nn.ModuleList([ + operations.LayerNorm( + dim, + elementwise_affine=False, + eps=1e-6, dtype=dtype, device=device + ) for _ in range(audio_injector_id) + ]) + if enable_adain: + self.injector_adain_layers = nn.ModuleList([ + AdaLayerNorm( + output_dim=dim * 2, embedding_dim=adain_dim, dtype=dtype, device=device, operations=operations) + for _ in range(audio_injector_id) + ]) + if adain_mode != "attn_norm": + self.injector_adain_output_layers = nn.ModuleList( + [operations.Linear(dim, dim, dtype=dtype, device=device) for _ in range(audio_injector_id)]) + + def forward(self, x, block_id, audio_emb, audio_emb_global, seq_len): + audio_attn_id = self.injected_block_id.get(block_id, None) + if audio_attn_id is None: + return x + + num_frames = audio_emb.shape[1] + input_hidden_states = rearrange(x[:, :seq_len], "b (t n) c -> (b t) n c", t=num_frames) + if self.enable_adain and self.adain_mode == "attn_norm": + audio_emb_global = rearrange(audio_emb_global, "b t n c -> (b t) n c") + adain_hidden_states = self.injector_adain_layers[audio_attn_id](input_hidden_states, temb=audio_emb_global[:, 0]) + attn_hidden_states = adain_hidden_states + else: + attn_hidden_states = self.injector_pre_norm_feat[audio_attn_id](input_hidden_states) + audio_emb = rearrange(audio_emb, "b t n c -> (b t) n c", t=num_frames) + attn_audio_emb = audio_emb + residual_out = self.injector[audio_attn_id](x=attn_hidden_states, context=attn_audio_emb) + residual_out = rearrange( + residual_out, "(b t) n c -> b (t n) c", t=num_frames) + x[:, :seq_len] = x[:, :seq_len] + residual_out + return x + + +class FramePackMotioner(nn.Module): + def __init__( + self, + inner_dim=1024, + num_heads=16, # Used to indicate the number of heads in the backbone network; unrelated to this module's design + zip_frame_buckets=[ + 1, 2, 16 + ], # Three numbers representing the number of frames sampled for patch operations from the nearest to the farthest frames + drop_mode="drop", # If not "drop", it will use "padd", meaning padding instead of deletion + dtype=None, + device=None, + operations=None): + super().__init__() + self.proj = operations.Conv3d(16, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2), dtype=dtype, device=device) + self.proj_2x = operations.Conv3d(16, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4), dtype=dtype, device=device) + self.proj_4x = operations.Conv3d(16, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8), dtype=dtype, device=device) + self.zip_frame_buckets = zip_frame_buckets + + self.inner_dim = inner_dim + self.num_heads = num_heads + + self.drop_mode = drop_mode + + def forward(self, motion_latents, rope_embedder, add_last_motion=2): + lat_height, lat_width = motion_latents.shape[3], motion_latents.shape[4] + padd_lat = torch.zeros(motion_latents.shape[0], 16, sum(self.zip_frame_buckets), lat_height, lat_width).to(device=motion_latents.device, dtype=motion_latents.dtype) + overlap_frame = min(padd_lat.shape[2], motion_latents.shape[2]) + if overlap_frame > 0: + padd_lat[:, :, -overlap_frame:] = motion_latents[:, :, -overlap_frame:] + + if add_last_motion < 2 and self.drop_mode != "drop": + zero_end_frame = sum(self.zip_frame_buckets[:len(self.zip_frame_buckets) - add_last_motion - 1]) + padd_lat[:, :, -zero_end_frame:] = 0 + + clean_latents_4x, clean_latents_2x, clean_latents_post = padd_lat[:, :, -sum(self.zip_frame_buckets):, :, :].split(self.zip_frame_buckets[::-1], dim=2) # 16, 2 ,1 + + # patchfy + clean_latents_post = self.proj(clean_latents_post).flatten(2).transpose(1, 2) + clean_latents_2x = self.proj_2x(clean_latents_2x) + l_2x_shape = clean_latents_2x.shape + clean_latents_2x = clean_latents_2x.flatten(2).transpose(1, 2) + clean_latents_4x = self.proj_4x(clean_latents_4x) + l_4x_shape = clean_latents_4x.shape + clean_latents_4x = clean_latents_4x.flatten(2).transpose(1, 2) + + if add_last_motion < 2 and self.drop_mode == "drop": + clean_latents_post = clean_latents_post[:, : + 0] if add_last_motion < 2 else clean_latents_post + clean_latents_2x = clean_latents_2x[:, : + 0] if add_last_motion < 1 else clean_latents_2x + + motion_lat = torch.cat([clean_latents_post, clean_latents_2x, clean_latents_4x], dim=1) + + rope_post = rope_embedder.rope_encode(1, lat_height, lat_width, t_start=-1, device=motion_latents.device, dtype=motion_latents.dtype) + rope_2x = rope_embedder.rope_encode(1, lat_height, lat_width, t_start=-3, steps_h=l_2x_shape[-2], steps_w=l_2x_shape[-1], device=motion_latents.device, dtype=motion_latents.dtype) + rope_4x = rope_embedder.rope_encode(4, lat_height, lat_width, t_start=-19, steps_h=l_4x_shape[-2], steps_w=l_4x_shape[-1], device=motion_latents.device, dtype=motion_latents.dtype) + + rope = torch.cat([rope_post, rope_2x, rope_4x], dim=1) + return motion_lat, rope + + +class WanModel_S2V(WanModel): + def __init__(self, + model_type='s2v', + patch_size=(1, 2, 2), + text_len=512, + in_dim=16, + dim=2048, + ffn_dim=8192, + freq_dim=256, + text_dim=4096, + out_dim=16, + num_heads=16, + num_layers=32, + window_size=(-1, -1), + qk_norm=True, + cross_attn_norm=True, + eps=1e-6, + audio_dim=1024, + num_audio_token=4, + enable_adain=True, + cond_dim=16, + audio_inject_layers=[0, 4, 8, 12, 16, 20, 24, 27, 30, 33, 36, 39], + adain_mode="attn_norm", + framepack_drop_mode="padd", + image_model=None, + device=None, + dtype=None, + operations=None, + ): + + super().__init__(model_type='t2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, image_model=image_model, device=device, dtype=dtype, operations=operations) + + self.trainable_cond_mask = operations.Embedding(3, self.dim, device=device, dtype=dtype) + + self.casual_audio_encoder = CausalAudioEncoder( + dim=audio_dim, + out_dim=self.dim, + num_token=num_audio_token, + need_global=enable_adain, dtype=dtype, device=device, operations=operations) + + if cond_dim > 0: + self.cond_encoder = operations.Conv3d( + cond_dim, + self.dim, + kernel_size=self.patch_size, + stride=self.patch_size, device=device, dtype=dtype) + + self.audio_injector = AudioInjector_WAN( + dim=self.dim, + num_heads=self.num_heads, + inject_layer=audio_inject_layers, + root_net=self, + enable_adain=enable_adain, + adain_dim=self.dim, + adain_mode=adain_mode, + dtype=dtype, device=device, operations=operations + ) + + self.frame_packer = FramePackMotioner( + inner_dim=self.dim, + num_heads=self.num_heads, + zip_frame_buckets=[1, 2, 16], + drop_mode=framepack_drop_mode, + dtype=dtype, device=device, operations=operations) + + def forward_orig( + self, + x, + t, + context, + audio_embed=None, + reference_latent=None, + control_video=None, + reference_motion=None, + clip_fea=None, + freqs=None, + transformer_options={}, + **kwargs, + ): + if audio_embed is not None: + num_embeds = x.shape[-3] * 4 + audio_emb_global, audio_emb = self.casual_audio_encoder(audio_embed[:, :, :, :num_embeds]) + else: + audio_emb = None + + # embeddings + x = self.patch_embedding(x.float()).to(x.dtype) + if control_video is not None: + x = x + self.cond_encoder(control_video) + + if t.ndim == 1: + t = t.unsqueeze(1).repeat(1, x.shape[2]) + + grid_sizes = x.shape[2:] + x = x.flatten(2).transpose(1, 2) + seq_len = x.size(1) + + cond_mask_weight = comfy.model_management.cast_to(self.trainable_cond_mask.weight, dtype=x.dtype, device=x.device).unsqueeze(1).unsqueeze(1) + x = x + cond_mask_weight[0] + + if reference_latent is not None: + ref = self.patch_embedding(reference_latent.float()).to(x.dtype) + ref = ref.flatten(2).transpose(1, 2) + freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], t_start=30, device=x.device, dtype=x.dtype) + ref = ref + cond_mask_weight[1] + x = torch.cat([x, ref], dim=1) + freqs = torch.cat([freqs, freqs_ref], dim=1) + t = torch.cat([t, torch.zeros((t.shape[0], reference_latent.shape[-3]), device=t.device, dtype=t.dtype)], dim=1) + + if reference_motion is not None: + motion_encoded, freqs_motion = self.frame_packer(reference_motion, self) + motion_encoded = motion_encoded + cond_mask_weight[2] + x = torch.cat([x, motion_encoded], dim=1) + freqs = torch.cat([freqs, freqs_motion], dim=1) + + t = torch.repeat_interleave(t, 2, dim=1) + t = torch.cat([t, torch.zeros((t.shape[0], 3), device=t.device, dtype=t.dtype)], dim=1) + + # time embeddings + e = self.time_embedding( + sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype)) + e = e.reshape(t.shape[0], -1, e.shape[-1]) + e0 = self.time_projection(e).unflatten(2, (6, self.dim)) + + # context + context = self.text_embedding(context) + + + patches_replace = transformer_options.get("patches_replace", {}) + blocks_replace = patches_replace.get("dit", {}) + for i, block in enumerate(self.blocks): + if ("double_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"]) + return out + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap}) + x = out["img"] + else: + x = block(x, e=e0, freqs=freqs, context=context) + if audio_emb is not None: + x = self.audio_injector(x, i, audio_emb, audio_emb_global, seq_len) + # head + x = self.head(x, e) + + # unpatchify + x = self.unpatchify(x, grid_sizes) + return x diff --git a/comfy/model_base.py b/comfy/model_base.py index 6c861b15e..18d55c1c4 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1201,6 +1201,29 @@ class WAN21_Camera(WAN21): out['camera_conditions'] = comfy.conds.CONDRegular(camera_conditions) return out +class WAN22_S2V(WAN21): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel_S2V) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + audio_embed = kwargs.get("audio_embed", None) + if audio_embed is not None: + out['audio_embed'] = comfy.conds.CONDRegular(audio_embed) + + reference_latents = kwargs.get("reference_latents", None) + if reference_latents is not None: + out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])) + + reference_motion = kwargs.get("reference_motion", None) + if reference_motion is not None: + out['reference_motion'] = comfy.conds.CONDRegular(self.process_latent_in(reference_motion)) + + control_video = kwargs.get("control_video", None) + if control_video is not None: + out['control_video'] = comfy.conds.CONDRegular(self.process_latent_in(control_video)) + return out + class WAN22(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 0caff53e0..9f3ab64df 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -368,6 +368,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["model_type"] = "camera" else: dit_config["model_type"] = "camera_2.2" + elif '{}casual_audio_encoder.encoder.final_linear.weight'.format(key_prefix) in state_dict_keys: + dit_config["model_type"] = "s2v" else: if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "i2v" diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 7ed6dfd69..ce571e6cb 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1072,6 +1072,19 @@ class WAN21_Vace(WAN21_T2V): out = model_base.WAN21_Vace(self, image_to_video=False, device=device) return out +class WAN22_S2V(WAN21_T2V): + unet_config = { + "image_model": "wan2.1", + "model_type": "s2v", + } + + def __init__(self, unet_config): + super().__init__(unet_config) + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.WAN22_S2V(self, device=device) + return out + class WAN22_T2V(WAN21_T2V): unet_config = { "image_model": "wan2.1", @@ -1272,6 +1285,6 @@ class QwenImage(supported_models_base.BASE): return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 0fff02f76..89ff74d85 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -786,6 +786,180 @@ class WanTrackToVideo(io.ComfyNode): return io.NodeOutput(positive, negative, out_latent) +def linear_interpolation(features, input_fps, output_fps, output_len=None): + """ + features: shape=[1, T, 512] + input_fps: fps for audio, f_a + output_fps: fps for video, f_m + output_len: video length + """ + features = features.transpose(1, 2) # [1, 512, T] + seq_len = features.shape[2] / float(input_fps) # T/f_a + if output_len is None: + output_len = int(seq_len * output_fps) # f_m*T/f_a + output_features = torch.nn.functional.interpolate( + features, size=output_len, align_corners=True, + mode='linear') # [1, 512, output_len] + return output_features.transpose(1, 2) # [1, output_len, 512] + + +def get_sample_indices(original_fps, + total_frames, + target_fps, + num_sample, + fixed_start=None): + required_duration = num_sample / target_fps + required_origin_frames = int(np.ceil(required_duration * original_fps)) + if required_duration > total_frames / original_fps: + raise ValueError("required_duration must be less than video length") + + if not fixed_start is None and fixed_start >= 0: + start_frame = fixed_start + else: + max_start = total_frames - required_origin_frames + if max_start < 0: + raise ValueError("video length is too short") + start_frame = np.random.randint(0, max_start + 1) + start_time = start_frame / original_fps + + end_time = start_time + required_duration + time_points = np.linspace(start_time, end_time, num_sample, endpoint=False) + + frame_indices = np.round(np.array(time_points) * original_fps).astype(int) + frame_indices = np.clip(frame_indices, 0, total_frames - 1) + return frame_indices + + +def get_audio_embed_bucket_fps(audio_embed, fps=16, batch_frames=81, m=0, video_rate=30): + num_layers, audio_frame_num, audio_dim = audio_embed.shape + + if num_layers > 1: + return_all_layers = True + else: + return_all_layers = False + + scale = video_rate / fps + + min_batch_num = int(audio_frame_num / (batch_frames * scale)) + 1 + + bucket_num = min_batch_num * batch_frames + padd_audio_num = math.ceil(min_batch_num * batch_frames / fps * video_rate) - audio_frame_num + batch_idx = get_sample_indices( + original_fps=video_rate, + total_frames=audio_frame_num + padd_audio_num, + target_fps=fps, + num_sample=bucket_num, + fixed_start=0) + batch_audio_eb = [] + audio_sample_stride = int(video_rate / fps) + for bi in batch_idx: + if bi < audio_frame_num: + + chosen_idx = list( + range(bi - m * audio_sample_stride, bi + (m + 1) * audio_sample_stride, audio_sample_stride)) + chosen_idx = [0 if c < 0 else c for c in chosen_idx] + chosen_idx = [ + audio_frame_num - 1 if c >= audio_frame_num else c + for c in chosen_idx + ] + + if return_all_layers: + frame_audio_embed = audio_embed[:, chosen_idx].flatten( + start_dim=-2, end_dim=-1) + else: + frame_audio_embed = audio_embed[0][chosen_idx].flatten() + else: + frame_audio_embed = torch.zeros([audio_dim * (2 * m + 1)], device=audio_embed.device) if not return_all_layers \ + else torch.zeros([num_layers, audio_dim * (2 * m + 1)], device=audio_embed.device) + batch_audio_eb.append(frame_audio_embed) + batch_audio_eb = torch.cat([c.unsqueeze(0) for c in batch_audio_eb], dim=0) + + return batch_audio_eb, min_batch_num + + +class WanSoundImageToVideo(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanSoundImageToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=77, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.AudioEncoderOutput.Input("audio_encoder_output", optional=True), + io.Image.Input("ref_image", optional=True), + io.Image.Input("control_video", optional=True), + io.Image.Input("ref_motion", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + is_experimental=True, + ) + + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, audio_encoder_output=None, control_video=None, ref_motion=None) -> io.NodeOutput: + latent_t = ((length - 1) // 4) + 1 + if audio_encoder_output is not None: + feat = torch.cat(audio_encoder_output["encoded_audio_all_layers"]) + video_rate = 30 + fps = 16 + feat = linear_interpolation(feat, input_fps=50, output_fps=video_rate) + audio_embed_bucket, num_repeat = get_audio_embed_bucket_fps(feat, fps=fps, batch_frames=latent_t * 4, m=0, video_rate=video_rate) + audio_embed_bucket = audio_embed_bucket.unsqueeze(0) + if len(audio_embed_bucket.shape) == 3: + audio_embed_bucket = audio_embed_bucket.permute(0, 2, 1) + elif len(audio_embed_bucket.shape) == 4: + audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1) + + positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket}) + negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket}) + + if ref_image is not None: + ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + ref_latent = vae.encode(ref_image[:, :, :, :3]) + positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) + negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True) + + if ref_motion is not None: + if ref_motion.shape[0] > 73: + ref_motion = ref_motion[-73:] + + ref_motion = comfy.utils.common_upscale(ref_motion.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + + if ref_motion.shape[0] < 73: + r = torch.ones([73, height, width, 3]) * 0.5 + r[-ref_motion.shape[0]:] = ref_motion + ref_motion = r + + ref_motion = vae.encode(ref_motion[:, :, :, :3]) + positive = node_helpers.conditioning_set_values(positive, {"reference_motion": ref_motion}) + negative = node_helpers.conditioning_set_values(negative, {"reference_motion": ref_motion}) + + latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + + control_video_out = comfy.latent_formats.Wan21().process_out(torch.zeros_like(latent)) + if control_video is not None: + control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + control_video = vae.encode(control_video[:, :, :, :3]) + control_video_out[:, :, :control_video.shape[2]] = control_video + + # TODO: check if zero is better than none if none provided + positive = node_helpers.conditioning_set_values(positive, {"control_video": control_video_out}) + negative = node_helpers.conditioning_set_values(negative, {"control_video": control_video_out}) + + out_latent = {} + out_latent["samples"] = latent + return io.NodeOutput(positive, negative, out_latent) + + class Wan22ImageToVideoLatent(io.ComfyNode): @classmethod def define_schema(cls): @@ -844,6 +1018,7 @@ class WanExtension(ComfyExtension): TrimVideoLatent, WanCameraImageToVideo, WanPhantomSubjectToVideo, + WanSoundImageToVideo, Wan22ImageToVideoLatent, ] From 31a37686d02aeaba8ea827933832be7601b31fac Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 27 Aug 2025 09:44:29 -0700 Subject: [PATCH 197/759] Negative audio in s2v should be zeros. (#9578) --- comfy_extras/nodes_wan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 89ff74d85..312260f00 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -920,7 +920,7 @@ class WanSoundImageToVideo(io.ComfyNode): audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1) positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket}) - negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket}) + negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0}) if ref_image is not None: ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) From b20ba1f27cbd4e1c84cf8ec72b345723de9e7c80 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Thu, 28 Aug 2025 00:45:02 +0800 Subject: [PATCH 198/759] Fix #9537 (#9576) --- comfy/weight_adapter/lokr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy/weight_adapter/lokr.py b/comfy/weight_adapter/lokr.py index 49b0be55f..563c835f5 100644 --- a/comfy/weight_adapter/lokr.py +++ b/comfy/weight_adapter/lokr.py @@ -97,6 +97,9 @@ class LoKrAdapter(WeightAdapterBase): (mat1, mat2, alpha, None, None, None, None, None, None) ) + def to_train(self): + return LokrDiff(self.weights) + @classmethod def load( cls, From b5ac6ed7ce73294e0025ffe3b16452d8434b83c7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 27 Aug 2025 12:26:28 -0700 Subject: [PATCH 199/759] Fixes to make controlnet type models work on qwen edit and kontext. (#9581) --- comfy/ldm/flux/model.py | 4 ++-- comfy/ldm/qwen_image/model.py | 2 +- comfy_extras/nodes_model_patch.py | 8 +++++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 0a77fa097..1344c3a57 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -158,7 +158,7 @@ class Flux(nn.Module): if i < len(control_i): add = control_i[i] if add is not None: - img += add + img[:, :add.shape[1]] += add if img.dtype == torch.float16: img = torch.nan_to_num(img, nan=0.0, posinf=65504, neginf=-65504) @@ -189,7 +189,7 @@ class Flux(nn.Module): if i < len(control_o): add = control_o[i] if add is not None: - img[:, txt.shape[1] :, ...] += add + img[:, txt.shape[1] : txt.shape[1] + add.shape[1], ...] += add img = img[:, txt.shape[1] :, ...] diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 57a458210..04071f31c 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -459,7 +459,7 @@ class QwenImageTransformer2DModel(nn.Module): if i < len(control_i): add = control_i[i] if add is not None: - hidden_states += add + hidden_states[:, :add.shape[1]] += add hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index 3eaada9bc..32c40ced3 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -89,6 +89,7 @@ class DiffSynthCnetPatch: self.strength = strength self.mask = mask self.encoded_image = model_patch.model.process_input_latent_image(self.encode_latent_cond(image)) + self.encoded_image_size = (image.shape[1], image.shape[2]) def encode_latent_cond(self, image): latent_image = self.vae.encode(image) @@ -106,14 +107,15 @@ class DiffSynthCnetPatch: x = kwargs.get("x") img = kwargs.get("img") block_index = kwargs.get("block_index") - if self.encoded_image is None or self.encoded_image.shape[1:] != img.shape[1:]: - spacial_compression = self.vae.spacial_compression_encode() + spacial_compression = self.vae.spacial_compression_encode() + if self.encoded_image is None or self.encoded_image_size != (x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression): image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") loaded_models = comfy.model_management.loaded_models(only_currently_used=True) self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1))) + self.encoded_image_size = (image_scaled.shape[-2], image_scaled.shape[-1]) comfy.model_management.load_models_gpu(loaded_models) - img = img + (self.model_patch.model.control_block(img, self.encoded_image.to(img.dtype), block_index) * self.strength) + img[:, :self.encoded_image.shape[1]] += (self.model_patch.model.control_block(img[:, :self.encoded_image.shape[1]], self.encoded_image.to(img.dtype), block_index) * self.strength) kwargs['img'] = img return kwargs From 496888fd68813033c260195bf70e4d11181e5454 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 27 Aug 2025 13:06:40 -0700 Subject: [PATCH 200/759] Improve s2v performance when generating videos longer than 120 frames. (#9582) --- comfy/ldm/wan/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index dedfb47e2..e70446c86 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -1255,6 +1255,7 @@ class WanModel_S2V(WanModel): audio_emb = None # embeddings + bs, _, time, height, width = x.shape x = self.patch_embedding(x.float()).to(x.dtype) if control_video is not None: x = x + self.cond_encoder(control_video) @@ -1272,7 +1273,7 @@ class WanModel_S2V(WanModel): if reference_latent is not None: ref = self.patch_embedding(reference_latent.float()).to(x.dtype) ref = ref.flatten(2).transpose(1, 2) - freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], t_start=30, device=x.device, dtype=x.dtype) + freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], t_start=max(30, time + 9), device=x.device, dtype=x.dtype) ref = ref + cond_mask_weight[1] x = torch.cat([x, ref], dim=1) freqs = torch.cat([freqs, freqs_ref], dim=1) @@ -1296,7 +1297,6 @@ class WanModel_S2V(WanModel): # context context = self.text_embedding(context) - patches_replace = transformer_options.get("patches_replace", {}) blocks_replace = patches_replace.get("dit", {}) for i, block in enumerate(self.blocks): From 491755325cc189d0aa1513b12fac738c87e38de6 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 27 Aug 2025 16:02:42 -0700 Subject: [PATCH 201/759] Better s2v memory estimation. (#9584) --- comfy/ldm/wan/model.py | 2 ++ comfy/model_base.py | 25 +++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index e70446c86..47857dc2b 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -1278,6 +1278,7 @@ class WanModel_S2V(WanModel): x = torch.cat([x, ref], dim=1) freqs = torch.cat([freqs, freqs_ref], dim=1) t = torch.cat([t, torch.zeros((t.shape[0], reference_latent.shape[-3]), device=t.device, dtype=t.dtype)], dim=1) + del ref, freqs_ref if reference_motion is not None: motion_encoded, freqs_motion = self.frame_packer(reference_motion, self) @@ -1287,6 +1288,7 @@ class WanModel_S2V(WanModel): t = torch.repeat_interleave(t, 2, dim=1) t = torch.cat([t, torch.zeros((t.shape[0], 3), device=t.device, dtype=t.dtype)], dim=1) + del motion_encoded, freqs_motion # time embeddings e = self.time_embedding( diff --git a/comfy/model_base.py b/comfy/model_base.py index 18d55c1c4..ce29fdc49 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -150,6 +150,7 @@ class BaseModel(torch.nn.Module): logging.debug("adm {}".format(self.adm_channels)) self.memory_usage_factor = model_config.memory_usage_factor self.memory_usage_factor_conds = () + self.memory_usage_shape_process = {} def apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs): return comfy.patcher_extension.WrapperExecutor.new_class_executor( @@ -350,8 +351,15 @@ class BaseModel(torch.nn.Module): input_shapes = [input_shape] for c in self.memory_usage_factor_conds: shape = cond_shapes.get(c, None) - if shape is not None and len(shape) > 0: - input_shapes += shape + if shape is not None: + if c in self.memory_usage_shape_process: + out = [] + for s in shape: + out.append(self.memory_usage_shape_process[c](s)) + shape = out + + if len(shape) > 0: + input_shapes += shape if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention(): dtype = self.get_dtype() @@ -1204,6 +1212,8 @@ class WAN21_Camera(WAN21): class WAN22_S2V(WAN21): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel_S2V) + self.memory_usage_factor_conds = ("reference_latent", "reference_motion") + self.memory_usage_shape_process = {"reference_motion": lambda shape: [shape[0], shape[1], 1.5, shape[-2], shape[-1]]} def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) @@ -1224,6 +1234,17 @@ class WAN22_S2V(WAN21): out['control_video'] = comfy.conds.CONDRegular(self.process_latent_in(control_video)) return out + def extra_conds_shapes(self, **kwargs): + out = {} + ref_latents = kwargs.get("reference_latents", None) + if ref_latents is not None: + out['reference_latent'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16]) + + reference_motion = kwargs.get("reference_motion", None) + if reference_motion is not None: + out['reference_motion'] = reference_motion.shape + return out + class WAN22(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) From 3aad339b63f03e17dc6ebae035b90afc2fefb627 Mon Sep 17 00:00:00 2001 From: Gangin Park Date: Thu, 28 Aug 2025 08:07:31 +0900 Subject: [PATCH 202/759] Add DPM++ 2M SDE Heun (RES) sampler (#9542) --- comfy/k_diffusion/sampling.py | 15 +++++++++++++++ comfy/samplers.py | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) mode change 100644 => 100755 comfy/samplers.py diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index a2bc492fd..fe6844b17 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -853,6 +853,11 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl return x +@torch.no_grad() +def sample_dpmpp_2m_sde_heun(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='heun'): + return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type) + + @torch.no_grad() def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): """DPM-Solver++(3M) SDE.""" @@ -925,6 +930,16 @@ def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler) +@torch.no_grad() +def sample_dpmpp_2m_sde_heun_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='heun'): + if len(sigmas) <= 1: + return x + extra_args = {} if extra_args is None else extra_args + sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max() + noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler + return sample_dpmpp_2m_sde_heun(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type) + + @torch.no_grad() def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'): if len(sigmas) <= 1: diff --git a/comfy/samplers.py b/comfy/samplers.py old mode 100644 new mode 100755 index c7dfef4ea..b3202cec6 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -729,7 +729,7 @@ class Sampler: KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_cfg_pp", "heun", "heunpp2","dpm_2", "dpm_2_ancestral", "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu", - "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm", + "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_2m_sde_heun", "dpmpp_2m_sde_heun_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm", "ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp", "gradient_estimation", "gradient_estimation_cfg_pp", "er_sde", "seeds_2", "seeds_3", "sa_solver", "sa_solver_pece"] From 38f697d953c3989db67e543795768bf954ae0231 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 27 Aug 2025 19:28:10 -0700 Subject: [PATCH 203/759] Add a LatentConcat node. (#9587) --- comfy_extras/nodes_latent.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py index f33ed1bee..247d886a1 100644 --- a/comfy_extras/nodes_latent.py +++ b/comfy_extras/nodes_latent.py @@ -105,6 +105,38 @@ class LatentInterpolate: samples_out["samples"] = st * (m1 * ratio + m2 * (1.0 - ratio)) return (samples_out,) +class LatentConcat: + @classmethod + def INPUT_TYPES(s): + return {"required": { "samples1": ("LATENT",), "samples2": ("LATENT",), "dim": (["x", "-x", "y", "-y", "t", "-t"], )}} + + RETURN_TYPES = ("LATENT",) + FUNCTION = "op" + + CATEGORY = "latent/advanced" + + def op(self, samples1, samples2, dim): + samples_out = samples1.copy() + + s1 = samples1["samples"] + s2 = samples2["samples"] + s2 = comfy.utils.repeat_to_batch_size(s2, s1.shape[0]) + + if "-" in dim: + c = (s2, s1) + else: + c = (s1, s2) + + if "x" in dim: + dim = -1 + elif "y" in dim: + dim = -2 + elif "t" in dim: + dim = -3 + + samples_out["samples"] = torch.cat(c, dim=dim) + return (samples_out,) + class LatentBatch: @classmethod def INPUT_TYPES(s): @@ -279,6 +311,7 @@ NODE_CLASS_MAPPINGS = { "LatentSubtract": LatentSubtract, "LatentMultiply": LatentMultiply, "LatentInterpolate": LatentInterpolate, + "LatentConcat": LatentConcat, "LatentBatch": LatentBatch, "LatentBatchSeedBehavior": LatentBatchSeedBehavior, "LatentApplyOperation": LatentApplyOperation, From 4aa79dbf2c5118853659fc7f7f8590594ab72417 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 27 Aug 2025 20:08:17 -0700 Subject: [PATCH 204/759] Adjust flux mem usage factor a bit. (#9588) --- comfy/supported_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index ce571e6cb..76260de00 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -700,7 +700,7 @@ class Flux(supported_models_base.BASE): unet_extra_config = {} latent_format = latent_formats.Flux - memory_usage_factor = 2.8 + memory_usage_factor = 3.1 # TODO: debug why flux mem usage is so weird on windows. supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] From 0eb821a7b6612af0fa3aaa8302739788a4bd629e Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 27 Aug 2025 23:09:06 -0400 Subject: [PATCH 205/759] ComfyUI 0.3.53 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 834c3e8c2..d6fdc47fe 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.52" +__version__ = "0.3.53" diff --git a/pyproject.toml b/pyproject.toml index f6e765a81..a71ad2bbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.52" +version = "0.3.53" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From ce0052c087cb1e81ba01e8afbe362bec54eeb665 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 28 Aug 2025 07:37:42 -0700 Subject: [PATCH 206/759] Fix diffsynth controlnet regression. (#9597) --- comfy_extras/nodes_model_patch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index 32c40ced3..65e766b52 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -108,7 +108,7 @@ class DiffSynthCnetPatch: img = kwargs.get("img") block_index = kwargs.get("block_index") spacial_compression = self.vae.spacial_compression_encode() - if self.encoded_image is None or self.encoded_image_size != (x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression): + if self.encoded_image is None or self.encoded_image_size != (x.shape[-2] * spacial_compression, x.shape[-1] * spacial_compression): image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") loaded_models = comfy.model_management.loaded_models(only_currently_used=True) self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1))) From 00636101771cb373354d6294cc6567deda2635f6 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Thu, 28 Aug 2025 10:44:57 -0400 Subject: [PATCH 207/759] ComfyUI version 0.3.54 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index d6fdc47fe..7034953fd 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.53" +__version__ = "0.3.54" diff --git a/pyproject.toml b/pyproject.toml index a71ad2bbf..9f9ac1e21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.53" +version = "0.3.54" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From edde0b50431e296f61f79205e25cb01f653013a2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 28 Aug 2025 14:59:48 -0700 Subject: [PATCH 208/759] WanSoundImageToVideoExtend node to manually extend s2v video. (#9606) --- comfy_extras/nodes_wan.py | 145 +++++++++++++++++++++++++------------- 1 file changed, 97 insertions(+), 48 deletions(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 312260f00..0a55bd5d0 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -877,6 +877,67 @@ def get_audio_embed_bucket_fps(audio_embed, fps=16, batch_frames=81, m=0, video_ return batch_audio_eb, min_batch_num +def wan_sound_to_video(positive, negative, vae, width, height, length, batch_size, frame_offset=0, ref_image=None, audio_encoder_output=None, control_video=None, ref_motion=None, ref_motion_latent=None): + latent_t = ((length - 1) // 4) + 1 + if audio_encoder_output is not None: + feat = torch.cat(audio_encoder_output["encoded_audio_all_layers"]) + video_rate = 30 + fps = 16 + feat = linear_interpolation(feat, input_fps=50, output_fps=video_rate) + batch_frames = latent_t * 4 + audio_embed_bucket, num_repeat = get_audio_embed_bucket_fps(feat, fps=fps, batch_frames=batch_frames, m=0, video_rate=video_rate) + audio_embed_bucket = audio_embed_bucket.unsqueeze(0) + if len(audio_embed_bucket.shape) == 3: + audio_embed_bucket = audio_embed_bucket.permute(0, 2, 1) + elif len(audio_embed_bucket.shape) == 4: + audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1) + + audio_embed_bucket = audio_embed_bucket[:, :, :, frame_offset:frame_offset + batch_frames] + positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket}) + negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0}) + frame_offset += batch_frames + + if ref_image is not None: + ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + ref_latent = vae.encode(ref_image[:, :, :, :3]) + positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) + negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True) + + if ref_motion is not None: + if ref_motion.shape[0] > 73: + ref_motion = ref_motion[-73:] + + ref_motion = comfy.utils.common_upscale(ref_motion.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + + if ref_motion.shape[0] < 73: + r = torch.ones([73, height, width, 3]) * 0.5 + r[-ref_motion.shape[0]:] = ref_motion + ref_motion = r + + ref_motion_latent = vae.encode(ref_motion[:, :, :, :3]) + + if ref_motion_latent is not None: + ref_motion_latent = ref_motion_latent[:, :, -19:] + positive = node_helpers.conditioning_set_values(positive, {"reference_motion": ref_motion_latent}) + negative = node_helpers.conditioning_set_values(negative, {"reference_motion": ref_motion_latent}) + + latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + + control_video_out = comfy.latent_formats.Wan21().process_out(torch.zeros_like(latent)) + if control_video is not None: + control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + control_video = vae.encode(control_video[:, :, :, :3]) + control_video_out[:, :, :control_video.shape[2]] = control_video + + # TODO: check if zero is better than none if none provided + positive = node_helpers.conditioning_set_values(positive, {"control_video": control_video_out}) + negative = node_helpers.conditioning_set_values(negative, {"control_video": control_video_out}) + + out_latent = {} + out_latent["samples"] = latent + return positive, negative, out_latent, frame_offset + + class WanSoundImageToVideo(io.ComfyNode): @classmethod def define_schema(cls): @@ -906,57 +967,44 @@ class WanSoundImageToVideo(io.ComfyNode): @classmethod def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, audio_encoder_output=None, control_video=None, ref_motion=None) -> io.NodeOutput: - latent_t = ((length - 1) // 4) + 1 - if audio_encoder_output is not None: - feat = torch.cat(audio_encoder_output["encoded_audio_all_layers"]) - video_rate = 30 - fps = 16 - feat = linear_interpolation(feat, input_fps=50, output_fps=video_rate) - audio_embed_bucket, num_repeat = get_audio_embed_bucket_fps(feat, fps=fps, batch_frames=latent_t * 4, m=0, video_rate=video_rate) - audio_embed_bucket = audio_embed_bucket.unsqueeze(0) - if len(audio_embed_bucket.shape) == 3: - audio_embed_bucket = audio_embed_bucket.permute(0, 2, 1) - elif len(audio_embed_bucket.shape) == 4: - audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1) + positive, negative, out_latent, frame_offset = wan_sound_to_video(positive, negative, vae, width, height, length, batch_size, ref_image=ref_image, audio_encoder_output=audio_encoder_output, + control_video=control_video, ref_motion=ref_motion) + return io.NodeOutput(positive, negative, out_latent) - positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket}) - negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0}) - if ref_image is not None: - ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) - ref_latent = vae.encode(ref_image[:, :, :, :3]) - positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) - negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True) +class WanSoundImageToVideoExtend(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanSoundImageToVideoExtend", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("length", default=77, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Latent.Input("video_latent"), + io.AudioEncoderOutput.Input("audio_encoder_output", optional=True), + io.Image.Input("ref_image", optional=True), + io.Image.Input("control_video", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + is_experimental=True, + ) - if ref_motion is not None: - if ref_motion.shape[0] > 73: - ref_motion = ref_motion[-73:] - - ref_motion = comfy.utils.common_upscale(ref_motion.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) - - if ref_motion.shape[0] < 73: - r = torch.ones([73, height, width, 3]) * 0.5 - r[-ref_motion.shape[0]:] = ref_motion - ref_motion = r - - ref_motion = vae.encode(ref_motion[:, :, :, :3]) - positive = node_helpers.conditioning_set_values(positive, {"reference_motion": ref_motion}) - negative = node_helpers.conditioning_set_values(negative, {"reference_motion": ref_motion}) - - latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - - control_video_out = comfy.latent_formats.Wan21().process_out(torch.zeros_like(latent)) - if control_video is not None: - control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) - control_video = vae.encode(control_video[:, :, :, :3]) - control_video_out[:, :, :control_video.shape[2]] = control_video - - # TODO: check if zero is better than none if none provided - positive = node_helpers.conditioning_set_values(positive, {"control_video": control_video_out}) - negative = node_helpers.conditioning_set_values(negative, {"control_video": control_video_out}) - - out_latent = {} - out_latent["samples"] = latent + @classmethod + def execute(cls, positive, negative, vae, length, video_latent, ref_image=None, audio_encoder_output=None, control_video=None) -> io.NodeOutput: + video_latent = video_latent["samples"] + width = video_latent.shape[-1] * 8 + height = video_latent.shape[-2] * 8 + batch_size = video_latent.shape[0] + frame_offset = video_latent.shape[-3] * 4 + positive, negative, out_latent, frame_offset = wan_sound_to_video(positive, negative, vae, width, height, length, batch_size, frame_offset=frame_offset, ref_image=ref_image, audio_encoder_output=audio_encoder_output, + control_video=control_video, ref_motion=None, ref_motion_latent=video_latent) return io.NodeOutput(positive, negative, out_latent) @@ -1019,6 +1067,7 @@ class WanExtension(ComfyExtension): WanCameraImageToVideo, WanPhantomSubjectToVideo, WanSoundImageToVideo, + WanSoundImageToVideoExtend, Wan22ImageToVideoLatent, ] From 1c184c29eb2a8f6fdd4e49f27347809090038e3f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 28 Aug 2025 15:34:01 -0700 Subject: [PATCH 209/759] Fix issue with s2v node when extending past audio length. (#9608) --- comfy_extras/nodes_wan.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 0a55bd5d0..2cbc93ceb 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -893,9 +893,10 @@ def wan_sound_to_video(positive, negative, vae, width, height, length, batch_siz audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1) audio_embed_bucket = audio_embed_bucket[:, :, :, frame_offset:frame_offset + batch_frames] - positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket}) - negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0}) - frame_offset += batch_frames + if audio_embed_bucket.shape[3] > 0: + positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket}) + negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0}) + frame_offset += batch_frames if ref_image is not None: ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) From d28b39d93dc498110e28ca32c8f39e6de631aa42 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 28 Aug 2025 16:38:28 -0700 Subject: [PATCH 210/759] Add a LatentCut node to cut latents. (#9609) --- comfy_extras/nodes_latent.py | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py index 247d886a1..0f90cf60c 100644 --- a/comfy_extras/nodes_latent.py +++ b/comfy_extras/nodes_latent.py @@ -1,6 +1,7 @@ import comfy.utils import comfy_extras.nodes_post_processing import torch +import nodes def reshape_latent_to(target_shape, latent, repeat_batch=True): @@ -137,6 +138,41 @@ class LatentConcat: samples_out["samples"] = torch.cat(c, dim=dim) return (samples_out,) +class LatentCut: + @classmethod + def INPUT_TYPES(s): + return {"required": {"samples": ("LATENT",), + "dim": (["x", "y", "t"], ), + "index": ("INT", {"default": 0, "min": -nodes.MAX_RESOLUTION, "max": nodes.MAX_RESOLUTION, "step": 1}), + "amount": ("INT", {"default": 1, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 1})}} + + RETURN_TYPES = ("LATENT",) + FUNCTION = "op" + + CATEGORY = "latent/advanced" + + def op(self, samples, dim, index, amount): + samples_out = samples.copy() + + s1 = samples["samples"] + + if "x" in dim: + dim = s1.ndim - 1 + elif "y" in dim: + dim = s1.ndim - 2 + elif "t" in dim: + dim = s1.ndim - 3 + + if index >= 0: + index = min(index, s1.shape[dim] - 1) + amount = min(s1.shape[dim] - index, amount) + else: + index = max(index, -s1.shape[dim]) + amount = min(-index, amount) + + samples_out["samples"] = torch.narrow(s1, dim, index, amount) + return (samples_out,) + class LatentBatch: @classmethod def INPUT_TYPES(s): @@ -312,6 +348,7 @@ NODE_CLASS_MAPPINGS = { "LatentMultiply": LatentMultiply, "LatentInterpolate": LatentInterpolate, "LatentConcat": LatentConcat, + "LatentCut": LatentCut, "LatentBatch": LatentBatch, "LatentBatchSeedBehavior": LatentBatchSeedBehavior, "LatentApplyOperation": LatentApplyOperation, From e80a14ad5073d9eba175c2d2c768a5ca8e4c63ea Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:13:07 -0700 Subject: [PATCH 211/759] Support wan2.2 5B fun control model. (#9611) Use the Wan22FunControlToVideo node. --- comfy/model_base.py | 15 ++++++--------- comfy_extras/nodes_wan.py | 19 ++++++++++++------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index ce29fdc49..56a6798be 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1110,9 +1110,10 @@ class WAN21(BaseModel): shape_image[1] = extra_channels image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device) else: + latent_dim = self.latent_format.latent_channels image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") - for i in range(0, image.shape[1], 16): - image[:, i: i + 16] = self.process_latent_in(image[:, i: i + 16]) + for i in range(0, image.shape[1], latent_dim): + image[:, i: i + latent_dim] = self.process_latent_in(image[:, i: i + latent_dim]) image = utils.resize_to_batch_size(image, noise.shape[0]) if extra_channels != image.shape[1] + 4: @@ -1245,18 +1246,14 @@ class WAN22_S2V(WAN21): out['reference_motion'] = reference_motion.shape return out -class WAN22(BaseModel): +class WAN22(WAN21): def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): - super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) + super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) self.image_to_video = image_to_video def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) - cross_attn = kwargs.get("cross_attn", None) - if cross_attn is not None: - out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) - - denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None)) + denoise_mask = kwargs.get("denoise_mask", None) if denoise_mask is not None: out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask) return out diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 2cbc93ceb..8c1d36613 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -139,16 +139,21 @@ class Wan22FunControlToVideo(io.ComfyNode): @classmethod def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, start_image=None, control_video=None) -> io.NodeOutput: - latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) + spacial_scale = vae.spacial_compression_encode() + latent_channels = vae.latent_channels + latent = torch.zeros([batch_size, latent_channels, ((length - 1) // 4) + 1, height // spacial_scale, width // spacial_scale], device=comfy.model_management.intermediate_device()) + concat_latent = torch.zeros([batch_size, latent_channels, ((length - 1) // 4) + 1, height // spacial_scale, width // spacial_scale], device=comfy.model_management.intermediate_device()) + if latent_channels == 48: + concat_latent = comfy.latent_formats.Wan22().process_out(concat_latent) + else: + concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) concat_latent = concat_latent.repeat(1, 2, 1, 1, 1) mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1])) if start_image is not None: start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) concat_latent_image = vae.encode(start_image[:, :, :, :3]) - concat_latent[:,16:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] + concat_latent[:,latent_channels:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] mask[:, :, :start_image.shape[0] + 3] = 0.0 ref_latent = None @@ -159,11 +164,11 @@ class Wan22FunControlToVideo(io.ComfyNode): if control_video is not None: control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) concat_latent_image = vae.encode(control_video[:, :, :, :3]) - concat_latent[:,:16,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] + concat_latent[:,:latent_channels,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2) - positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16}) - negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16}) + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": latent_channels}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": latent_channels}) if ref_latent is not None: positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) From c7bb3e2bceaad7accd52c23d22b97a1b6808304b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:46:57 -0700 Subject: [PATCH 212/759] Support the 5B fun inpaint model. (#9614) Use the WanFunInpaintToVideo node without the clip_vision_output. --- comfy_extras/nodes_wan.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 8c1d36613..4f73369f5 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -206,7 +206,8 @@ class WanFirstLastFrameToVideo(io.ComfyNode): @classmethod def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None) -> io.NodeOutput: - latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + spacial_scale = vae.spacial_compression_encode() + latent = torch.zeros([batch_size, vae.latent_channels, ((length - 1) // 4) + 1, height // spacial_scale, width // spacial_scale], device=comfy.model_management.intermediate_device()) if start_image is not None: start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) if end_image is not None: From 15aa9222c4d1fc74f5190d7c7e56ef986d0d7146 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 29 Aug 2025 01:12:00 -0700 Subject: [PATCH 213/759] Trim audio to video when saving video. (#9617) --- comfy_api/latest/_input_impl/video_types.py | 34 ++++++--------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py index 28de9651d..f646504c8 100644 --- a/comfy_api/latest/_input_impl/video_types.py +++ b/comfy_api/latest/_input_impl/video_types.py @@ -8,6 +8,7 @@ import av import io import json import numpy as np +import math import torch from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents @@ -282,8 +283,6 @@ class VideoFromComponents(VideoInput): if self.__components.audio: audio_sample_rate = int(self.__components.audio['sample_rate']) audio_stream = output.add_stream('aac', rate=audio_sample_rate) - audio_stream.sample_rate = audio_sample_rate - audio_stream.format = 'fltp' # Encode video for i, frame in enumerate(self.__components.images): @@ -298,27 +297,12 @@ class VideoFromComponents(VideoInput): output.mux(packet) if audio_stream and self.__components.audio: - # Encode audio - samples_per_frame = int(audio_sample_rate / frame_rate) - num_frames = self.__components.audio['waveform'].shape[2] // samples_per_frame - for i in range(num_frames): - start = i * samples_per_frame - end = start + samples_per_frame - # TODO(Feature) - Add support for stereo audio - chunk = ( - self.__components.audio["waveform"][0, 0, start:end] - .unsqueeze(0) - .contiguous() - .numpy() - ) - audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono') - audio_frame.sample_rate = audio_sample_rate - audio_frame.pts = i * samples_per_frame - for packet in audio_stream.encode(audio_frame): - output.mux(packet) - - # Flush audio - for packet in audio_stream.encode(None): - output.mux(packet) - + waveform = self.__components.audio['waveform'] + waveform = waveform[:, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])] + frame = av.AudioFrame.from_ndarray(waveform.movedim(2, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[1] == 1 else 'stereo') + frame.sample_rate = audio_sample_rate + frame.pts = 0 + output.mux(audio_stream.encode(frame)) + # Flush encoder + output.mux(audio_stream.encode(None)) From 2efb2cbc38714074b0a48a9f4d70fa43f41499f4 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Fri, 29 Aug 2025 18:03:25 +0800 Subject: [PATCH 214/759] Update template to 0.1.70 (#9620) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 93d88859d..7f64aacca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.11 -comfyui-workflow-templates==0.1.68 +comfyui-workflow-templates==0.1.70 comfyui-embedded-docs==0.2.6 torch torchsde From a86aaa430183068e2a264495c802c81d05eb350a Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Fri, 29 Aug 2025 05:33:29 -0400 Subject: [PATCH 215/759] ComfyUI v0.3.55 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 7034953fd..36777e285 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.54" +__version__ = "0.3.55" diff --git a/pyproject.toml b/pyproject.toml index 9f9ac1e21..04514b4a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.54" +version = "0.3.55" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 885015eecf649d6e49e1ade68e4475b434517b82 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 29 Aug 2025 20:06:04 -0700 Subject: [PATCH 216/759] Lower ram usage on windows. (#9628) --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index 9b2a33011..b23d50816 100644 --- a/main.py +++ b/main.py @@ -112,6 +112,7 @@ import gc if os.name == "nt": + os.environ['MIMALLOC_PURGE_DELAY'] = '0' logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage()) if __name__ == "__main__": From 4449e147692366ac8b9bd3b8834c771bc81e91ac Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Sat, 30 Aug 2025 06:31:19 -0400 Subject: [PATCH 217/759] ComfyUI version 0.3.56 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 36777e285..e8e039373 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.55" +__version__ = "0.3.56" diff --git a/pyproject.toml b/pyproject.toml index 04514b4a8..cfd5d45ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.55" +version = "0.3.56" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From f949094b3cbc33779dbf8d3fd140028f8044d5c1 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 31 Aug 2025 06:19:21 +0300 Subject: [PATCH 218/759] convert Stable Cascade nodes to V3 schema (#9373) --- comfy_extras/nodes_stable_cascade.py | 165 +++++++++++++++------------ 1 file changed, 93 insertions(+), 72 deletions(-) diff --git a/comfy_extras/nodes_stable_cascade.py b/comfy_extras/nodes_stable_cascade.py index 003403215..04c0b366a 100644 --- a/comfy_extras/nodes_stable_cascade.py +++ b/comfy_extras/nodes_stable_cascade.py @@ -17,55 +17,61 @@ """ import torch -import nodes +from typing_extensions import override + import comfy.utils +import nodes +from comfy_api.latest import ComfyExtension, io -class StableCascade_EmptyLatentImage: - def __init__(self, device="cpu"): - self.device = device +class StableCascade_EmptyLatentImage(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="StableCascade_EmptyLatentImage", + category="latent/stable_cascade", + inputs=[ + io.Int.Input("width", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("height", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("compression", default=42, min=4, max=128, step=1), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(display_name="stage_c"), + io.Latent.Output(display_name="stage_b"), + ], + ) @classmethod - def INPUT_TYPES(s): - return {"required": { - "width": ("INT", {"default": 1024, "min": 256, "max": nodes.MAX_RESOLUTION, "step": 8}), - "height": ("INT", {"default": 1024, "min": 256, "max": nodes.MAX_RESOLUTION, "step": 8}), - "compression": ("INT", {"default": 42, "min": 4, "max": 128, "step": 1}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}) - }} - RETURN_TYPES = ("LATENT", "LATENT") - RETURN_NAMES = ("stage_c", "stage_b") - FUNCTION = "generate" - - CATEGORY = "latent/stable_cascade" - - def generate(self, width, height, compression, batch_size=1): + def execute(cls, width, height, compression, batch_size=1): c_latent = torch.zeros([batch_size, 16, height // compression, width // compression]) b_latent = torch.zeros([batch_size, 4, height // 4, width // 4]) - return ({ + return io.NodeOutput({ "samples": c_latent, }, { "samples": b_latent, }) -class StableCascade_StageC_VAEEncode: - def __init__(self, device="cpu"): - self.device = device + +class StableCascade_StageC_VAEEncode(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="StableCascade_StageC_VAEEncode", + category="latent/stable_cascade", + inputs=[ + io.Image.Input("image"), + io.Vae.Input("vae"), + io.Int.Input("compression", default=42, min=4, max=128, step=1), + ], + outputs=[ + io.Latent.Output(display_name="stage_c"), + io.Latent.Output(display_name="stage_b"), + ], + ) @classmethod - def INPUT_TYPES(s): - return {"required": { - "image": ("IMAGE",), - "vae": ("VAE", ), - "compression": ("INT", {"default": 42, "min": 4, "max": 128, "step": 1}), - }} - RETURN_TYPES = ("LATENT", "LATENT") - RETURN_NAMES = ("stage_c", "stage_b") - FUNCTION = "generate" - - CATEGORY = "latent/stable_cascade" - - def generate(self, image, vae, compression): + def execute(cls, image, vae, compression): width = image.shape[-2] height = image.shape[-3] out_width = (width // compression) * vae.downscale_ratio @@ -75,51 +81,59 @@ class StableCascade_StageC_VAEEncode: c_latent = vae.encode(s[:,:,:,:3]) b_latent = torch.zeros([c_latent.shape[0], 4, (height // 8) * 2, (width // 8) * 2]) - return ({ + return io.NodeOutput({ "samples": c_latent, }, { "samples": b_latent, }) -class StableCascade_StageB_Conditioning: + +class StableCascade_StageB_Conditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "conditioning": ("CONDITIONING",), - "stage_c": ("LATENT",), - }} - RETURN_TYPES = ("CONDITIONING",) + def define_schema(cls): + return io.Schema( + node_id="StableCascade_StageB_Conditioning", + category="conditioning/stable_cascade", + inputs=[ + io.Conditioning.Input("conditioning"), + io.Latent.Input("stage_c"), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - FUNCTION = "set_prior" - - CATEGORY = "conditioning/stable_cascade" - - def set_prior(self, conditioning, stage_c): + @classmethod + def execute(cls, conditioning, stage_c): c = [] for t in conditioning: d = t[1].copy() - d['stable_cascade_prior'] = stage_c['samples'] + d["stable_cascade_prior"] = stage_c["samples"] n = [t[0], d] c.append(n) - return (c, ) + return io.NodeOutput(c) -class StableCascade_SuperResolutionControlnet: - def __init__(self, device="cpu"): - self.device = device + +class StableCascade_SuperResolutionControlnet(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="StableCascade_SuperResolutionControlnet", + category="_for_testing/stable_cascade", + is_experimental=True, + inputs=[ + io.Image.Input("image"), + io.Vae.Input("vae"), + ], + outputs=[ + io.Image.Output(display_name="controlnet_input"), + io.Latent.Output(display_name="stage_c"), + io.Latent.Output(display_name="stage_b"), + ], + ) @classmethod - def INPUT_TYPES(s): - return {"required": { - "image": ("IMAGE",), - "vae": ("VAE", ), - }} - RETURN_TYPES = ("IMAGE", "LATENT", "LATENT") - RETURN_NAMES = ("controlnet_input", "stage_c", "stage_b") - FUNCTION = "generate" - - EXPERIMENTAL = True - CATEGORY = "_for_testing/stable_cascade" - - def generate(self, image, vae): + def execute(cls, image, vae): width = image.shape[-2] height = image.shape[-3] batch_size = image.shape[0] @@ -127,15 +141,22 @@ class StableCascade_SuperResolutionControlnet: c_latent = torch.zeros([batch_size, 16, height // 16, width // 16]) b_latent = torch.zeros([batch_size, 4, height // 2, width // 2]) - return (controlnet_input, { + return io.NodeOutput(controlnet_input, { "samples": c_latent, }, { "samples": b_latent, }) -NODE_CLASS_MAPPINGS = { - "StableCascade_EmptyLatentImage": StableCascade_EmptyLatentImage, - "StableCascade_StageB_Conditioning": StableCascade_StageB_Conditioning, - "StableCascade_StageC_VAEEncode": StableCascade_StageC_VAEEncode, - "StableCascade_SuperResolutionControlnet": StableCascade_SuperResolutionControlnet, -} + +class StableCascadeExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + StableCascade_EmptyLatentImage, + StableCascade_StageB_Conditioning, + StableCascade_StageC_VAEEncode, + StableCascade_SuperResolutionControlnet, + ] + +async def comfy_entrypoint() -> StableCascadeExtension: + return StableCascadeExtension() From fea9ea8268d9fc0f4245f3fdc4a417ab802033e9 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 31 Aug 2025 06:19:54 +0300 Subject: [PATCH 219/759] convert Video nodes to V3 schema (#9489) --- comfy_extras/nodes_video.py | 286 +++++++++++++++++------------------- 1 file changed, 132 insertions(+), 154 deletions(-) diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py index 969f888b9..69fabb12e 100644 --- a/comfy_extras/nodes_video.py +++ b/comfy_extras/nodes_video.py @@ -5,52 +5,49 @@ import av import torch import folder_paths import json -from typing import Optional, Literal +from typing import Optional +from typing_extensions import override from fractions import Fraction -from comfy.comfy_types import IO, FileLocator, ComfyNodeABC -from comfy_api.latest import Input, InputImpl, Types +from comfy_api.input import AudioInput, ImageInput, VideoInput +from comfy_api.input_impl import VideoFromComponents, VideoFromFile +from comfy_api.util import VideoCodec, VideoComponents, VideoContainer +from comfy_api.latest import ComfyExtension, io, ui from comfy.cli_args import args -class SaveWEBM: - def __init__(self): - self.output_dir = folder_paths.get_output_directory() - self.type = "output" - self.prefix_append = "" +class SaveWEBM(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SaveWEBM", + category="image/video", + is_experimental=True, + inputs=[ + io.Image.Input("images"), + io.String.Input("filename_prefix", default="ComfyUI"), + io.Combo.Input("codec", options=["vp9", "av1"]), + io.Float.Input("fps", default=24.0, min=0.01, max=1000.0, step=0.01), + io.Float.Input("crf", default=32.0, min=0, max=63.0, step=1, tooltip="Higher crf means lower quality with a smaller file size, lower crf means higher quality higher filesize."), + ], + outputs=[], + hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo], + is_output_node=True, + ) @classmethod - def INPUT_TYPES(s): - return {"required": - {"images": ("IMAGE", ), - "filename_prefix": ("STRING", {"default": "ComfyUI"}), - "codec": (["vp9", "av1"],), - "fps": ("FLOAT", {"default": 24.0, "min": 0.01, "max": 1000.0, "step": 0.01}), - "crf": ("FLOAT", {"default": 32.0, "min": 0, "max": 63.0, "step": 1, "tooltip": "Higher crf means lower quality with a smaller file size, lower crf means higher quality higher filesize."}), - }, - "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, - } - - RETURN_TYPES = () - FUNCTION = "save_images" - - OUTPUT_NODE = True - - CATEGORY = "image/video" - - EXPERIMENTAL = True - - def save_images(self, images, codec, fps, filename_prefix, crf, prompt=None, extra_pnginfo=None): - filename_prefix += self.prefix_append - full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir, images[0].shape[1], images[0].shape[0]) + def execute(cls, images, codec, fps, filename_prefix, crf) -> io.NodeOutput: + full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path( + filename_prefix, folder_paths.get_output_directory(), images[0].shape[1], images[0].shape[0] + ) file = f"{filename}_{counter:05}_.webm" container = av.open(os.path.join(full_output_folder, file), mode="w") - if prompt is not None: - container.metadata["prompt"] = json.dumps(prompt) + if cls.hidden.prompt is not None: + container.metadata["prompt"] = json.dumps(cls.hidden.prompt) - if extra_pnginfo is not None: - for x in extra_pnginfo: - container.metadata[x] = json.dumps(extra_pnginfo[x]) + if cls.hidden.extra_pnginfo is not None: + for x in cls.hidden.extra_pnginfo: + container.metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x]) codec_map = {"vp9": "libvpx-vp9", "av1": "libsvtav1"} stream = container.add_stream(codec_map[codec], rate=Fraction(round(fps * 1000), 1000)) @@ -69,63 +66,46 @@ class SaveWEBM: container.mux(stream.encode()) container.close() - results: list[FileLocator] = [{ - "filename": file, - "subfolder": subfolder, - "type": self.type - }] + return io.NodeOutput(ui=ui.PreviewVideo([ui.SavedResult(file, subfolder, io.FolderType.output)])) - return {"ui": {"images": results, "animated": (True,)}} # TODO: frontend side - -class SaveVideo(ComfyNodeABC): - def __init__(self): - self.output_dir = folder_paths.get_output_directory() - self.type: Literal["output"] = "output" - self.prefix_append = "" +class SaveVideo(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SaveVideo", + display_name="Save Video", + category="image/video", + description="Saves the input images to your ComfyUI output directory.", + inputs=[ + io.Video.Input("video", tooltip="The video to save."), + io.String.Input("filename_prefix", default="video/ComfyUI", tooltip="The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."), + io.Combo.Input("format", options=VideoContainer.as_input(), default="auto", tooltip="The format to save the video as."), + io.Combo.Input("codec", options=VideoCodec.as_input(), default="auto", tooltip="The codec to use for the video."), + ], + outputs=[], + hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo], + is_output_node=True, + ) @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "video": (IO.VIDEO, {"tooltip": "The video to save."}), - "filename_prefix": ("STRING", {"default": "video/ComfyUI", "tooltip": "The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."}), - "format": (Types.VideoContainer.as_input(), {"default": "auto", "tooltip": "The format to save the video as."}), - "codec": (Types.VideoCodec.as_input(), {"default": "auto", "tooltip": "The codec to use for the video."}), - }, - "hidden": { - "prompt": "PROMPT", - "extra_pnginfo": "EXTRA_PNGINFO" - }, - } - - RETURN_TYPES = () - FUNCTION = "save_video" - - OUTPUT_NODE = True - - CATEGORY = "image/video" - DESCRIPTION = "Saves the input images to your ComfyUI output directory." - - def save_video(self, video: Input.Video, filename_prefix, format, codec, prompt=None, extra_pnginfo=None): - filename_prefix += self.prefix_append + def execute(cls, video: VideoInput, filename_prefix, format, codec) -> io.NodeOutput: width, height = video.get_dimensions() full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path( filename_prefix, - self.output_dir, + folder_paths.get_output_directory(), width, height ) - results: list[FileLocator] = list() saved_metadata = None if not args.disable_metadata: metadata = {} - if extra_pnginfo is not None: - metadata.update(extra_pnginfo) - if prompt is not None: - metadata["prompt"] = prompt + if cls.hidden.extra_pnginfo is not None: + metadata.update(cls.hidden.extra_pnginfo) + if cls.hidden.prompt is not None: + metadata["prompt"] = cls.hidden.prompt if len(metadata) > 0: saved_metadata = metadata - file = f"{filename}_{counter:05}_.{Types.VideoContainer.get_extension(format)}" + file = f"{filename}_{counter:05}_.{VideoContainer.get_extension(format)}" video.save_to( os.path.join(full_output_folder, file), format=format, @@ -133,83 +113,82 @@ class SaveVideo(ComfyNodeABC): metadata=saved_metadata ) - results.append({ - "filename": file, - "subfolder": subfolder, - "type": self.type - }) - counter += 1 + return io.NodeOutput(ui=ui.PreviewVideo([ui.SavedResult(file, subfolder, io.FolderType.output)])) - return { "ui": { "images": results, "animated": (True,) } } -class CreateVideo(ComfyNodeABC): +class CreateVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "images": (IO.IMAGE, {"tooltip": "The images to create a video from."}), - "fps": ("FLOAT", {"default": 30.0, "min": 1.0, "max": 120.0, "step": 1.0}), - }, - "optional": { - "audio": (IO.AUDIO, {"tooltip": "The audio to add to the video."}), - } - } + def define_schema(cls): + return io.Schema( + node_id="CreateVideo", + display_name="Create Video", + category="image/video", + description="Create a video from images.", + inputs=[ + io.Image.Input("images", tooltip="The images to create a video from."), + io.Float.Input("fps", default=30.0, min=1.0, max=120.0, step=1.0), + io.Audio.Input("audio", optional=True, tooltip="The audio to add to the video."), + ], + outputs=[ + io.Video.Output(), + ], + ) - RETURN_TYPES = (IO.VIDEO,) - FUNCTION = "create_video" - - CATEGORY = "image/video" - DESCRIPTION = "Create a video from images." - - def create_video(self, images: Input.Image, fps: float, audio: Optional[Input.Audio] = None): - return (InputImpl.VideoFromComponents( - Types.VideoComponents( - images=images, - audio=audio, - frame_rate=Fraction(fps), - ) - ),) - -class GetVideoComponents(ComfyNodeABC): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "video": (IO.VIDEO, {"tooltip": "The video to extract components from."}), - } - } - RETURN_TYPES = (IO.IMAGE, IO.AUDIO, IO.FLOAT) - RETURN_NAMES = ("images", "audio", "fps") - FUNCTION = "get_components" + def execute(cls, images: ImageInput, fps: float, audio: Optional[AudioInput] = None) -> io.NodeOutput: + return io.NodeOutput( + VideoFromComponents(VideoComponents(images=images, audio=audio, frame_rate=Fraction(fps))) + ) - CATEGORY = "image/video" - DESCRIPTION = "Extracts all components from a video: frames, audio, and framerate." +class GetVideoComponents(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="GetVideoComponents", + display_name="Get Video Components", + category="image/video", + description="Extracts all components from a video: frames, audio, and framerate.", + inputs=[ + io.Video.Input("video", tooltip="The video to extract components from."), + ], + outputs=[ + io.Image.Output(display_name="images"), + io.Audio.Output(display_name="audio"), + io.Float.Output(display_name="fps"), + ], + ) - def get_components(self, video: Input.Video): + @classmethod + def execute(cls, video: VideoInput) -> io.NodeOutput: components = video.get_components() - return (components.images, components.audio, float(components.frame_rate)) + return io.NodeOutput(components.images, components.audio, float(components.frame_rate)) -class LoadVideo(ComfyNodeABC): +class LoadVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(cls): + def define_schema(cls): input_dir = folder_paths.get_input_directory() files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))] files = folder_paths.filter_files_content_types(files, ["video"]) - return {"required": - {"file": (sorted(files), {"video_upload": True})}, - } - - CATEGORY = "image/video" - - RETURN_TYPES = (IO.VIDEO,) - FUNCTION = "load_video" - def load_video(self, file): - video_path = folder_paths.get_annotated_filepath(file) - return (InputImpl.VideoFromFile(video_path),) + return io.Schema( + node_id="LoadVideo", + display_name="Load Video", + category="image/video", + inputs=[ + io.Combo.Input("file", options=sorted(files), upload=io.UploadType.video), + ], + outputs=[ + io.Video.Output(), + ], + ) @classmethod - def IS_CHANGED(cls, file): + def execute(cls, file) -> io.NodeOutput: + video_path = folder_paths.get_annotated_filepath(file) + return io.NodeOutput(VideoFromFile(video_path)) + + @classmethod + def fingerprint_inputs(s, file): video_path = folder_paths.get_annotated_filepath(file) mod_time = os.path.getmtime(video_path) # Instead of hashing the file, we can just use the modification time to avoid @@ -217,24 +196,23 @@ class LoadVideo(ComfyNodeABC): return mod_time @classmethod - def VALIDATE_INPUTS(cls, file): + def validate_inputs(s, file): if not folder_paths.exists_annotated_filepath(file): return "Invalid video file: {}".format(file) return True -NODE_CLASS_MAPPINGS = { - "SaveWEBM": SaveWEBM, - "SaveVideo": SaveVideo, - "CreateVideo": CreateVideo, - "GetVideoComponents": GetVideoComponents, - "LoadVideo": LoadVideo, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "SaveVideo": "Save Video", - "CreateVideo": "Create Video", - "GetVideoComponents": "Get Video Components", - "LoadVideo": "Load Video", -} +class VideoExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + SaveWEBM, + SaveVideo, + CreateVideo, + GetVideoComponents, + LoadVideo, + ] +async def comfy_entrypoint() -> VideoExtension: + return VideoExtension() From d2c502e629ba948029abc13ef1b456b9f4bbbdaa Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 31 Aug 2025 06:20:17 +0300 Subject: [PATCH 220/759] convert nodes_stability.py to V3 schema (#9497) --- comfy_api_nodes/nodes_stability.py | 678 ++++++++++++++++------------- 1 file changed, 365 insertions(+), 313 deletions(-) diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index 31309d831..e05cb6bb2 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -1,5 +1,8 @@ from inspect import cleandoc -from comfy.comfy_types.node_typing import IO +from typing import Optional +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io as comfy_io from comfy_api_nodes.apis.stability_api import ( StabilityUpscaleConservativeRequest, StabilityUpscaleCreativeRequest, @@ -46,87 +49,94 @@ def get_async_dummy_status(x: StabilityResultsGetResponse): return StabilityPollStatus.in_progress -class StabilityStableImageUltraNode: +class StabilityStableImageUltraNode(comfy_io.ComfyNode): """ Generates images synchronously based on prompt and resolution. """ - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Stability AI" - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "What you wish to see in the output image. A strong, descriptive prompt that clearly defines" + - "What you wish to see in the output image. A strong, descriptive prompt that clearly defines" + + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityStableImageUltraNode", + display_name="Stability AI Stable Image Ultra", + category="api node/image/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines" + "elements, colors, and subjects will lead to better results. " + "To control the weight of a given word use the format `(word:weight)`," + "where `word` is the word you'd like to control the weight of and `weight`" + "is a value between 0 and 1. For example: `The sky was a crisp (blue:0.3) and (green:0.8)`" + - "would convey a sky that was blue and green, but more green than blue." - }, + "would convey a sky that was blue and green, but more green than blue.", ), - "aspect_ratio": ([x.value for x in StabilityAspectRatio], - { - "default": StabilityAspectRatio.ratio_1_1, - "tooltip": "Aspect ratio of generated image.", - }, + comfy_io.Combo.Input( + "aspect_ratio", + options=[x.value for x in StabilityAspectRatio], + default=StabilityAspectRatio.ratio_1_1.value, + tooltip="Aspect ratio of generated image.", ), - "style_preset": (get_stability_style_presets(), - { - "tooltip": "Optional desired style of generated image.", - }, + comfy_io.Combo.Input( + "style_preset", + options=get_stability_style_presets(), + tooltip="Optional desired style of generated image.", ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 4294967294, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967294, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", ), - }, - "optional": { - "image": (IO.IMAGE,), - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "A blurb of text describing what you do not wish to see in the output image. This is an advanced feature." - }, + comfy_io.Image.Input( + "image", + optional=True, ), - "image_denoise": ( - IO.FLOAT, - { - "default": 0.5, - "min": 0.0, - "max": 1.0, - "step": 0.01, - "tooltip": "Denoise of input image; 0.0 yields image identical to input, 1.0 is as if no image was provided at all.", - }, + comfy_io.String.Input( + "negative_prompt", + default="", + tooltip="A blurb of text describing what you do not wish to see in the output image. This is an advanced feature.", + force_input=True, + optional=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } + comfy_io.Float.Input( + "image_denoise", + default=0.5, + min=0.0, + max=1.0, + step=0.01, + tooltip="Denoise of input image; 0.0 yields image identical to input, 1.0 is as if no image was provided at all.", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - async def api_call(self, prompt: str, aspect_ratio: str, style_preset: str, seed: int, - negative_prompt: str=None, image: torch.Tensor = None, image_denoise: float=None, - **kwargs): + @classmethod + async def execute( + cls, + prompt: str, + aspect_ratio: str, + style_preset: str, + seed: int, + image: Optional[torch.Tensor] = None, + negative_prompt: str = "", + image_denoise: Optional[float] = 0.5, + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=False) # prepare image binary if image present image_binary = None @@ -144,6 +154,11 @@ class StabilityStableImageUltraNode: "image": image_binary } + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/stability/v2beta/stable-image/generate/ultra", @@ -161,7 +176,7 @@ class StabilityStableImageUltraNode: ), files=files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -171,95 +186,106 @@ class StabilityStableImageUltraNode: image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return (returned_image,) + return comfy_io.NodeOutput(returned_image) -class StabilityStableImageSD_3_5Node: +class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): """ Generates images synchronously based on prompt and resolution. """ - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Stability AI" + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityStableImageSD_3_5Node", + display_name="Stability AI Stable Diffusion 3.5 Image", + category="api node/image/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results.", + ), + comfy_io.Combo.Input( + "model", + options=[x.value for x in Stability_SD3_5_Model], + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[x.value for x in StabilityAspectRatio], + default=StabilityAspectRatio.ratio_1_1.value, + tooltip="Aspect ratio of generated image.", + ), + comfy_io.Combo.Input( + "style_preset", + options=get_stability_style_presets(), + tooltip="Optional desired style of generated image.", + ), + comfy_io.Float.Input( + "cfg_scale", + default=4.0, + min=1.0, + max=10.0, + step=0.1, + tooltip="How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt)", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967294, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + ), + comfy_io.Image.Input( + "image", + optional=True, + ), + comfy_io.String.Input( + "negative_prompt", + default="", + tooltip="Keywords of what you do not wish to see in the output image. This is an advanced feature.", + force_input=True, + optional=True, + ), + comfy_io.Float.Input( + "image_denoise", + default=0.5, + min=0.0, + max=1.0, + step=0.01, + tooltip="Denoise of input image; 0.0 yields image identical to input, 1.0 is as if no image was provided at all.", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results." - }, - ), - "model": ([x.value for x in Stability_SD3_5_Model],), - "aspect_ratio": ([x.value for x in StabilityAspectRatio], - { - "default": StabilityAspectRatio.ratio_1_1, - "tooltip": "Aspect ratio of generated image.", - }, - ), - "style_preset": (get_stability_style_presets(), - { - "tooltip": "Optional desired style of generated image.", - }, - ), - "cfg_scale": ( - IO.FLOAT, - { - "default": 4.0, - "min": 1.0, - "max": 10.0, - "step": 0.1, - "tooltip": "How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt)", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 4294967294, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, - ), - }, - "optional": { - "image": (IO.IMAGE,), - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "Keywords of what you do not wish to see in the output image. This is an advanced feature." - }, - ), - "image_denoise": ( - IO.FLOAT, - { - "default": 0.5, - "min": 0.0, - "max": 1.0, - "step": 0.01, - "tooltip": "Denoise of input image; 0.0 yields image identical to input, 1.0 is as if no image was provided at all.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call(self, model: str, prompt: str, aspect_ratio: str, style_preset: str, seed: int, cfg_scale: float, - negative_prompt: str=None, image: torch.Tensor = None, image_denoise: float=None, - **kwargs): + async def execute( + cls, + model: str, + prompt: str, + aspect_ratio: str, + style_preset: str, + seed: int, + cfg_scale: float, + image: Optional[torch.Tensor] = None, + negative_prompt: str = "", + image_denoise: Optional[float] = 0.5, + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=False) # prepare image binary if image present image_binary = None @@ -280,6 +306,11 @@ class StabilityStableImageSD_3_5Node: "image": image_binary } + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/stability/v2beta/stable-image/generate/sd3", @@ -300,7 +331,7 @@ class StabilityStableImageSD_3_5Node: ), files=files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -310,72 +341,75 @@ class StabilityStableImageSD_3_5Node: image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return (returned_image,) + return comfy_io.NodeOutput(returned_image) -class StabilityUpscaleConservativeNode: +class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): """ Upscale image with minimal alterations to 4K resolution. """ - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Stability AI" + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityUpscaleConservativeNode", + display_name="Stability AI Upscale Conservative", + category="api node/image/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("image"), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results.", + ), + comfy_io.Float.Input( + "creativity", + default=0.35, + min=0.2, + max=0.5, + step=0.01, + tooltip="Controls the likelihood of creating additional details not heavily conditioned by the init image.", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967294, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + ), + comfy_io.String.Input( + "negative_prompt", + default="", + tooltip="Keywords of what you do not wish to see in the output image. This is an advanced feature.", + force_input=True, + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results." - }, - ), - "creativity": ( - IO.FLOAT, - { - "default": 0.35, - "min": 0.2, - "max": 0.5, - "step": 0.01, - "tooltip": "Controls the likelihood of creating additional details not heavily conditioned by the init image.", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 4294967294, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, - ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "Keywords of what you do not wish to see in the output image. This is an advanced feature." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call(self, image: torch.Tensor, prompt: str, creativity: float, seed: int, negative_prompt: str=None, - **kwargs): + async def execute( + cls, + image: torch.Tensor, + prompt: str, + creativity: float, + seed: int, + negative_prompt: str = "", + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=False) image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read() @@ -386,6 +420,11 @@ class StabilityUpscaleConservativeNode: "image": image_binary } + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/stability/v2beta/stable-image/upscale/conservative", @@ -401,7 +440,7 @@ class StabilityUpscaleConservativeNode: ), files=files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -411,77 +450,81 @@ class StabilityUpscaleConservativeNode: image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return (returned_image,) + return comfy_io.NodeOutput(returned_image) -class StabilityUpscaleCreativeNode: +class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): """ Upscale image with minimal alterations to 4K resolution. """ - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Stability AI" + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityUpscaleCreativeNode", + display_name="Stability AI Upscale Creative", + category="api node/image/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("image"), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results.", + ), + comfy_io.Float.Input( + "creativity", + default=0.3, + min=0.1, + max=0.5, + step=0.01, + tooltip="Controls the likelihood of creating additional details not heavily conditioned by the init image.", + ), + comfy_io.Combo.Input( + "style_preset", + options=get_stability_style_presets(), + tooltip="Optional desired style of generated image.", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967294, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + ), + comfy_io.String.Input( + "negative_prompt", + default="", + tooltip="Keywords of what you do not wish to see in the output image. This is an advanced feature.", + force_input=True, + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results." - }, - ), - "creativity": ( - IO.FLOAT, - { - "default": 0.3, - "min": 0.1, - "max": 0.5, - "step": 0.01, - "tooltip": "Controls the likelihood of creating additional details not heavily conditioned by the init image.", - }, - ), - "style_preset": (get_stability_style_presets(), - { - "tooltip": "Optional desired style of generated image.", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 4294967294, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, - ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "Keywords of what you do not wish to see in the output image. This is an advanced feature." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call(self, image: torch.Tensor, prompt: str, creativity: float, style_preset: str, seed: int, negative_prompt: str=None, - **kwargs): + async def execute( + cls, + image: torch.Tensor, + prompt: str, + creativity: float, + style_preset: str, + seed: int, + negative_prompt: str = "", + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=False) image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read() @@ -494,6 +537,11 @@ class StabilityUpscaleCreativeNode: "image": image_binary } + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/stability/v2beta/stable-image/upscale/creative", @@ -510,7 +558,7 @@ class StabilityUpscaleCreativeNode: ), files=files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -525,7 +573,8 @@ class StabilityUpscaleCreativeNode: completed_statuses=[StabilityPollStatus.finished], failed_statuses=[StabilityPollStatus.failed], status_extractor=lambda x: get_async_dummy_status(x), - auth_kwargs=kwargs, + auth_kwargs=auth, + node_id=cls.hidden.unique_id, ) response_poll: StabilityResultsGetResponse = await operation.execute() @@ -535,41 +584,48 @@ class StabilityUpscaleCreativeNode: image_data = base64.b64decode(response_poll.result) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return (returned_image,) + return comfy_io.NodeOutput(returned_image) -class StabilityUpscaleFastNode: +class StabilityUpscaleFastNode(comfy_io.ComfyNode): """ Quickly upscales an image via Stability API call to 4x its original size; intended for upscaling low-quality/compressed images. """ - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Stability AI" + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityUpscaleFastNode", + display_name="Stability AI Upscale Fast", + category="api node/image/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("image"), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - }, - "optional": { - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call(self, image: torch.Tensor, **kwargs): + async def execute(cls, image: torch.Tensor) -> comfy_io.NodeOutput: image_binary = tensor_to_bytesio(image, total_pixels=4096*4096).read() files = { "image": image_binary } + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/stability/v2beta/stable-image/upscale/fast", @@ -580,7 +636,7 @@ class StabilityUpscaleFastNode: request=EmptyRequest(), files=files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -590,24 +646,20 @@ class StabilityUpscaleFastNode: image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return (returned_image,) + return comfy_io.NodeOutput(returned_image) -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "StabilityStableImageUltraNode": StabilityStableImageUltraNode, - "StabilityStableImageSD_3_5Node": StabilityStableImageSD_3_5Node, - "StabilityUpscaleConservativeNode": StabilityUpscaleConservativeNode, - "StabilityUpscaleCreativeNode": StabilityUpscaleCreativeNode, - "StabilityUpscaleFastNode": StabilityUpscaleFastNode, -} +class StabilityExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + StabilityStableImageUltraNode, + StabilityStableImageSD_3_5Node, + StabilityUpscaleConservativeNode, + StabilityUpscaleCreativeNode, + StabilityUpscaleFastNode, + ] -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "StabilityStableImageUltraNode": "Stability AI Stable Image Ultra", - "StabilityStableImageSD_3_5Node": "Stability AI Stable Diffusion 3.5 Image", - "StabilityUpscaleConservativeNode": "Stability AI Upscale Conservative", - "StabilityUpscaleCreativeNode": "Stability AI Upscale Creative", - "StabilityUpscaleFastNode": "Stability AI Upscale Fast", -} + +async def comfy_entrypoint() -> StabilityExtension: + return StabilityExtension() From fe442fac2eccd0cc66999b48d3c518623cafe4fc Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 31 Aug 2025 06:21:58 +0300 Subject: [PATCH 221/759] convert Primitive nodes to V3 schema (#9372) --- comfy_extras/nodes_primitive.py | 169 +++++++++++++++++--------------- 1 file changed, 90 insertions(+), 79 deletions(-) diff --git a/comfy_extras/nodes_primitive.py b/comfy_extras/nodes_primitive.py index 1f93f87a7..5a1aeba80 100644 --- a/comfy_extras/nodes_primitive.py +++ b/comfy_extras/nodes_primitive.py @@ -1,98 +1,109 @@ -# Primitive nodes that are evaluated at backend. -from __future__ import annotations - import sys +from typing_extensions import override -from comfy.comfy_types.node_typing import ComfyNodeABC, InputTypeDict, IO +from comfy_api.latest import ComfyExtension, io -class String(ComfyNodeABC): +class String(io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": {"value": (IO.STRING, {})}, - } + def define_schema(cls): + return io.Schema( + node_id="PrimitiveString", + display_name="String", + category="utils/primitive", + inputs=[ + io.String.Input("value"), + ], + outputs=[io.String.Output()], + ) - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/primitive" - - def execute(self, value: str) -> tuple[str]: - return (value,) - - -class StringMultiline(ComfyNodeABC): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": {"value": (IO.STRING, {"multiline": True,},)}, - } - - RETURN_TYPES = (IO.STRING,) - FUNCTION = "execute" - CATEGORY = "utils/primitive" - - def execute(self, value: str) -> tuple[str]: - return (value,) + def execute(cls, value: str) -> io.NodeOutput: + return io.NodeOutput(value) -class Int(ComfyNodeABC): +class StringMultiline(io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": {"value": (IO.INT, {"min": -sys.maxsize, "max": sys.maxsize, "control_after_generate": True})}, - } + def define_schema(cls): + return io.Schema( + node_id="PrimitiveStringMultiline", + display_name="String (Multiline)", + category="utils/primitive", + inputs=[ + io.String.Input("value", multiline=True), + ], + outputs=[io.String.Output()], + ) - RETURN_TYPES = (IO.INT,) - FUNCTION = "execute" - CATEGORY = "utils/primitive" - - def execute(self, value: int) -> tuple[int]: - return (value,) - - -class Float(ComfyNodeABC): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": {"value": (IO.FLOAT, {"min": -sys.maxsize, "max": sys.maxsize})}, - } - - RETURN_TYPES = (IO.FLOAT,) - FUNCTION = "execute" - CATEGORY = "utils/primitive" - - def execute(self, value: float) -> tuple[float]: - return (value,) + def execute(cls, value: str) -> io.NodeOutput: + return io.NodeOutput(value) -class Boolean(ComfyNodeABC): +class Int(io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": {"value": (IO.BOOLEAN, {})}, - } + def define_schema(cls): + return io.Schema( + node_id="PrimitiveInt", + display_name="Int", + category="utils/primitive", + inputs=[ + io.Int.Input("value", min=-sys.maxsize, max=sys.maxsize, control_after_generate=True), + ], + outputs=[io.Int.Output()], + ) - RETURN_TYPES = (IO.BOOLEAN,) - FUNCTION = "execute" - CATEGORY = "utils/primitive" - - def execute(self, value: bool) -> tuple[bool]: - return (value,) + @classmethod + def execute(cls, value: int) -> io.NodeOutput: + return io.NodeOutput(value) -NODE_CLASS_MAPPINGS = { - "PrimitiveString": String, - "PrimitiveStringMultiline": StringMultiline, - "PrimitiveInt": Int, - "PrimitiveFloat": Float, - "PrimitiveBoolean": Boolean, -} +class Float(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="PrimitiveFloat", + display_name="Float", + category="utils/primitive", + inputs=[ + io.Float.Input("value", min=-sys.maxsize, max=sys.maxsize), + ], + outputs=[io.Float.Output()], + ) -NODE_DISPLAY_NAME_MAPPINGS = { - "PrimitiveString": "String", - "PrimitiveStringMultiline": "String (Multiline)", - "PrimitiveInt": "Int", - "PrimitiveFloat": "Float", - "PrimitiveBoolean": "Boolean", -} + @classmethod + def execute(cls, value: float) -> io.NodeOutput: + return io.NodeOutput(value) + + +class Boolean(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="PrimitiveBoolean", + display_name="Boolean", + category="utils/primitive", + inputs=[ + io.Boolean.Input("value"), + ], + outputs=[io.Boolean.Output()], + ) + + @classmethod + def execute(cls, value: bool) -> io.NodeOutput: + return io.NodeOutput(value) + + +class PrimitivesExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + String, + StringMultiline, + Int, + Float, + Boolean, + ] + +async def comfy_entrypoint() -> PrimitivesExtension: + return PrimitivesExtension() From 32a627bf1feadb83abba97906a27978b927abd33 Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Sun, 31 Aug 2025 12:01:45 +0800 Subject: [PATCH 222/759] SEEDS: update noise decomposition and refactor (#9633) - Update the decomposition to reflect interval dependency - Extract phi computations into functions - Use torch.lerp for interpolation --- comfy/k_diffusion/sampling.py | 135 ++++++++++++++++++---------------- 1 file changed, 73 insertions(+), 62 deletions(-) diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index fe6844b17..2d7e09838 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -171,6 +171,16 @@ def offset_first_sigma_for_snr(sigmas, model_sampling, percent_offset=1e-4): return sigmas +def ei_h_phi_1(h: torch.Tensor) -> torch.Tensor: + """Compute the result of h*phi_1(h) in exponential integrator methods.""" + return torch.expm1(h) + + +def ei_h_phi_2(h: torch.Tensor) -> torch.Tensor: + """Compute the result of h*phi_2(h) in exponential integrator methods.""" + return (torch.expm1(h) - h) / h + + @torch.no_grad() def sample_euler(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.): """Implements Algorithm 2 (Euler steps) from Karras et al. (2022).""" @@ -1550,13 +1560,12 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None @torch.no_grad() def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5): """SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 2. - arXiv: https://arxiv.org/abs/2305.14267 + arXiv: https://arxiv.org/abs/2305.14267 (NeurIPS 2023) """ extra_args = {} if extra_args is None else extra_args seed = extra_args.get("seed", None) noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler s_in = x.new_ones([x.shape[0]]) - inject_noise = eta > 0 and s_noise > 0 model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling') @@ -1564,55 +1573,53 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling) sigmas = offset_first_sigma_for_snr(sigmas, model_sampling) + fac = 1 / (2 * r) + for i in trange(len(sigmas) - 1, disable=disable): denoised = model(x, sigmas[i] * s_in, **extra_args) if callback is not None: callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) + if sigmas[i + 1] == 0: x = denoised - else: - lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1]) - h = lambda_t - lambda_s - h_eta = h * (eta + 1) - lambda_s_1 = lambda_s + r * h - fac = 1 / (2 * r) - sigma_s_1 = sigma_fn(lambda_s_1) + continue - # alpha_t = sigma_t * exp(log(alpha_t / sigma_t)) = sigma_t * exp(lambda_t) - alpha_s_1 = sigma_s_1 * lambda_s_1.exp() - alpha_t = sigmas[i + 1] * lambda_t.exp() + lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1]) + h = lambda_t - lambda_s + h_eta = h * (eta + 1) + lambda_s_1 = torch.lerp(lambda_s, lambda_t, r) + sigma_s_1 = sigma_fn(lambda_s_1) - coeff_1, coeff_2 = (-r * h_eta).expm1(), (-h_eta).expm1() - if inject_noise: - # 0 < r < 1 - noise_coeff_1 = (-2 * r * h * eta).expm1().neg().sqrt() - noise_coeff_2 = (-r * h * eta).exp() * (-2 * (1 - r) * h * eta).expm1().neg().sqrt() - noise_1, noise_2 = noise_sampler(sigmas[i], sigma_s_1), noise_sampler(sigma_s_1, sigmas[i + 1]) + alpha_s_1 = sigma_s_1 * lambda_s_1.exp() + alpha_t = sigmas[i + 1] * lambda_t.exp() - # Step 1 - x_2 = sigma_s_1 / sigmas[i] * (-r * h * eta).exp() * x - alpha_s_1 * coeff_1 * denoised - if inject_noise: - x_2 = x_2 + sigma_s_1 * (noise_coeff_1 * noise_1) * s_noise - denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args) + # Step 1 + x_2 = sigma_s_1 / sigmas[i] * (-r * h * eta).exp() * x - alpha_s_1 * ei_h_phi_1(-r * h_eta) * denoised + if inject_noise: + sde_noise = (-2 * r * h * eta).expm1().neg().sqrt() * noise_sampler(sigmas[i], sigma_s_1) + x_2 = x_2 + sde_noise * sigma_s_1 * s_noise + denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args) - # Step 2 - denoised_d = (1 - fac) * denoised + fac * denoised_2 - x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * coeff_2 * denoised_d - if inject_noise: - x = x + sigmas[i + 1] * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise + # Step 2 + denoised_d = torch.lerp(denoised, denoised_2, fac) + x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d + if inject_noise: + segment_factor = (r - 1) * h * eta + sde_noise = sde_noise * segment_factor.exp() + sde_noise = sde_noise + segment_factor.mul(2).expm1().neg().sqrt() * noise_sampler(sigma_s_1, sigmas[i + 1]) + x = x + sde_noise * sigmas[i + 1] * s_noise return x @torch.no_grad() def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r_1=1./3, r_2=2./3): """SEEDS-3 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 3. - arXiv: https://arxiv.org/abs/2305.14267 + arXiv: https://arxiv.org/abs/2305.14267 (NeurIPS 2023) """ extra_args = {} if extra_args is None else extra_args seed = extra_args.get("seed", None) noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler s_in = x.new_ones([x.shape[0]]) - inject_noise = eta > 0 and s_noise > 0 model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling') @@ -1624,45 +1631,49 @@ def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=Non denoised = model(x, sigmas[i] * s_in, **extra_args) if callback is not None: callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) + if sigmas[i + 1] == 0: x = denoised - else: - lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1]) - h = lambda_t - lambda_s - h_eta = h * (eta + 1) - lambda_s_1 = lambda_s + r_1 * h - lambda_s_2 = lambda_s + r_2 * h - sigma_s_1, sigma_s_2 = sigma_fn(lambda_s_1), sigma_fn(lambda_s_2) + continue - # alpha_t = sigma_t * exp(log(alpha_t / sigma_t)) = sigma_t * exp(lambda_t) - alpha_s_1 = sigma_s_1 * lambda_s_1.exp() - alpha_s_2 = sigma_s_2 * lambda_s_2.exp() - alpha_t = sigmas[i + 1] * lambda_t.exp() + lambda_s, lambda_t = lambda_fn(sigmas[i]), lambda_fn(sigmas[i + 1]) + h = lambda_t - lambda_s + h_eta = h * (eta + 1) + lambda_s_1 = torch.lerp(lambda_s, lambda_t, r_1) + lambda_s_2 = torch.lerp(lambda_s, lambda_t, r_2) + sigma_s_1, sigma_s_2 = sigma_fn(lambda_s_1), sigma_fn(lambda_s_2) - coeff_1, coeff_2, coeff_3 = (-r_1 * h_eta).expm1(), (-r_2 * h_eta).expm1(), (-h_eta).expm1() - if inject_noise: - # 0 < r_1 < r_2 < 1 - noise_coeff_1 = (-2 * r_1 * h * eta).expm1().neg().sqrt() - noise_coeff_2 = (-r_1 * h * eta).exp() * (-2 * (r_2 - r_1) * h * eta).expm1().neg().sqrt() - noise_coeff_3 = (-r_2 * h * eta).exp() * (-2 * (1 - r_2) * h * eta).expm1().neg().sqrt() - noise_1, noise_2, noise_3 = noise_sampler(sigmas[i], sigma_s_1), noise_sampler(sigma_s_1, sigma_s_2), noise_sampler(sigma_s_2, sigmas[i + 1]) + alpha_s_1 = sigma_s_1 * lambda_s_1.exp() + alpha_s_2 = sigma_s_2 * lambda_s_2.exp() + alpha_t = sigmas[i + 1] * lambda_t.exp() - # Step 1 - x_2 = sigma_s_1 / sigmas[i] * (-r_1 * h * eta).exp() * x - alpha_s_1 * coeff_1 * denoised - if inject_noise: - x_2 = x_2 + sigma_s_1 * (noise_coeff_1 * noise_1) * s_noise - denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args) + # Step 1 + x_2 = sigma_s_1 / sigmas[i] * (-r_1 * h * eta).exp() * x - alpha_s_1 * ei_h_phi_1(-r_1 * h_eta) * denoised + if inject_noise: + sde_noise = (-2 * r_1 * h * eta).expm1().neg().sqrt() * noise_sampler(sigmas[i], sigma_s_1) + x_2 = x_2 + sde_noise * sigma_s_1 * s_noise + denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args) - # Step 2 - x_3 = sigma_s_2 / sigmas[i] * (-r_2 * h * eta).exp() * x - alpha_s_2 * coeff_2 * denoised + (r_2 / r_1) * alpha_s_2 * (coeff_2 / (r_2 * h_eta) + 1) * (denoised_2 - denoised) - if inject_noise: - x_3 = x_3 + sigma_s_2 * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise - denoised_3 = model(x_3, sigma_s_2 * s_in, **extra_args) + # Step 2 + a3_2 = r_2 / r_1 * ei_h_phi_2(-r_2 * h_eta) + a3_1 = ei_h_phi_1(-r_2 * h_eta) - a3_2 + x_3 = sigma_s_2 / sigmas[i] * (-r_2 * h * eta).exp() * x - alpha_s_2 * (a3_1 * denoised + a3_2 * denoised_2) + if inject_noise: + segment_factor = (r_1 - r_2) * h * eta + sde_noise = sde_noise * segment_factor.exp() + sde_noise = sde_noise + segment_factor.mul(2).expm1().neg().sqrt() * noise_sampler(sigma_s_1, sigma_s_2) + x_3 = x_3 + sde_noise * sigma_s_2 * s_noise + denoised_3 = model(x_3, sigma_s_2 * s_in, **extra_args) - # Step 3 - x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * coeff_3 * denoised + (1. / r_2) * alpha_t * (coeff_3 / h_eta + 1) * (denoised_3 - denoised) - if inject_noise: - x = x + sigmas[i + 1] * (noise_coeff_3 * noise_1 + noise_coeff_2 * noise_2 + noise_coeff_1 * noise_3) * s_noise + # Step 3 + b3 = ei_h_phi_2(-h_eta) / r_2 + b1 = ei_h_phi_1(-h_eta) - b3 + x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * (b1 * denoised + b3 * denoised_3) + if inject_noise: + segment_factor = (r_2 - 1) * h * eta + sde_noise = sde_noise * segment_factor.exp() + sde_noise = sde_noise + segment_factor.mul(2).expm1().neg().sqrt() * noise_sampler(sigma_s_2, sigmas[i + 1]) + x = x + sde_noise * sigmas[i + 1] * s_noise return x From 9b151559721ff6c8d93150f3d8a53259a23911cd Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 30 Aug 2025 22:32:10 -0700 Subject: [PATCH 223/759] Probably not necessary anymore. (#9646) --- main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/main.py b/main.py index b23d50816..c33f0e17b 100644 --- a/main.py +++ b/main.py @@ -113,7 +113,6 @@ import gc if os.name == "nt": os.environ['MIMALLOC_PURGE_DELAY'] = '0' - logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage()) if __name__ == "__main__": if args.default_device is not None: From 27e067ce505c102fd0f2be0f1242016c59a6816f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 1 Sep 2025 15:54:02 -0700 Subject: [PATCH 224/759] Implement the USO subject identity lora. (#9674) Use the lora with FluxContextMultiReferenceLatentMethod node set to "uso" and a ReferenceLatent node with the reference image. --- comfy/ldm/flux/model.py | 10 ++++++++-- comfy/lora.py | 4 ++++ comfy/lora_convert.py | 19 +++++++++++++++++++ comfy_extras/nodes_flux.py | 2 +- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 1344c3a57..1e62f4626 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -233,12 +233,18 @@ class Flux(nn.Module): h = 0 w = 0 index = 0 - index_ref_method = kwargs.get("ref_latents_method", "offset") == "index" + ref_latents_method = kwargs.get("ref_latents_method", "offset") for ref in ref_latents: - if index_ref_method: + if ref_latents_method == "index": index += 1 h_offset = 0 w_offset = 0 + elif ref_latents_method == "uso": + index = 0 + h_offset = h_len * patch_size + h + w_offset = w_len * patch_size + w + h += ref.shape[-2] + w += ref.shape[-1] else: index = 1 h_offset = 0 diff --git a/comfy/lora.py b/comfy/lora.py index 00358884b..4a44f1318 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -260,6 +260,10 @@ def model_lora_keys_unet(model, key_map={}): key_map["transformer.{}".format(k[:-len(".weight")])] = to #simpletrainer and probably regular diffusers flux lora format key_map["lycoris_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #simpletrainer lycoris key_map["lora_transformer_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #onetrainer + for k in sdk: + hidden_size = model.model_config.unet_config.get("hidden_size", 0) + if k.endswith(".weight") and ".linear1." in k: + key_map["{}".format(k.replace(".linear1.weight", ".linear1_qkv"))] = (k, (0, 0, hidden_size * 3)) if isinstance(model, comfy.model_base.GenmoMochi): for k in sdk: diff --git a/comfy/lora_convert.py b/comfy/lora_convert.py index 3e00b63db..9d8d21efe 100644 --- a/comfy/lora_convert.py +++ b/comfy/lora_convert.py @@ -15,10 +15,29 @@ def convert_lora_bfl_control(sd): #BFL loras for Flux def convert_lora_wan_fun(sd): #Wan Fun loras return comfy.utils.state_dict_prefix_replace(sd, {"lora_unet__": "lora_unet_"}) +def convert_uso_lora(sd): + sd_out = {} + for k in sd: + tensor = sd[k] + k_to = "diffusion_model.{}".format(k.replace(".down.weight", ".lora_down.weight") + .replace(".up.weight", ".lora_up.weight") + .replace(".qkv_lora2.", ".txt_attn.qkv.") + .replace(".qkv_lora1.", ".img_attn.qkv.") + .replace(".proj_lora1.", ".img_attn.proj.") + .replace(".proj_lora2.", ".txt_attn.proj.") + .replace(".qkv_lora.", ".linear1_qkv.") + .replace(".proj_lora.", ".linear2.") + .replace(".processor.", ".") + ) + sd_out[k_to] = tensor + return sd_out + def convert_lora(sd): if "img_in.lora_A.weight" in sd and "single_blocks.0.norm.key_norm.scale" in sd: return convert_lora_bfl_control(sd) if "lora_unet__blocks_0_cross_attn_k.lora_down.weight" in sd: return convert_lora_wan_fun(sd) + if "single_blocks.37.processor.qkv_lora.up.weight" in sd and "double_blocks.18.processor.qkv_lora2.up.weight" in sd: + return convert_uso_lora(sd) return sd diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index c8db75bb3..1bf7ddd92 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -105,7 +105,7 @@ class FluxKontextMultiReferenceLatentMethod: def INPUT_TYPES(s): return {"required": { "conditioning": ("CONDITIONING", ), - "reference_latents_method": (("offset", "index"), ), + "reference_latents_method": (("offset", "index", "uso"), ), }} RETURN_TYPES = ("CONDITIONING",) From e2d1e5dad98dbbcf505703ea8663f20101e6570a Mon Sep 17 00:00:00 2001 From: contentis Date: Tue, 2 Sep 2025 02:33:50 +0200 Subject: [PATCH 225/759] Enable Convolution AutoTuning (#9301) --- comfy/cli_args.py | 1 + comfy/ops.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index de3e85c08..72eeaea9a 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -143,6 +143,7 @@ class PerformanceFeature(enum.Enum): Fp16Accumulation = "fp16_accumulation" Fp8MatrixMultiplication = "fp8_matrix_mult" CublasOps = "cublas_ops" + AutoTune = "autotune" parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops") diff --git a/comfy/ops.py b/comfy/ops.py index 18e7db705..55e958adb 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -52,6 +52,9 @@ except (ModuleNotFoundError, TypeError): cast_to = comfy.model_management.cast_to #TODO: remove once no more references +if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast: + torch.backends.cudnn.benchmark = True + def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) From 3412d53b1d69e4dfedf7e86c3092cea085094053 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 2 Sep 2025 12:36:22 -0700 Subject: [PATCH 226/759] USO style reference. (#9677) Load the projector.safetensors file with the ModelPatchLoader node and use the siglip_vision_patch14_384.safetensors "clip vision" model and the USOStyleReferenceNode. --- comfy/clip_model.py | 12 +- comfy/clip_vision.py | 18 ++- comfy/ldm/flux/model.py | 11 +- comfy/model_patcher.py | 3 + comfy_extras/nodes_model_patch.py | 186 +++++++++++++++++++++++++++++- 5 files changed, 222 insertions(+), 8 deletions(-) diff --git a/comfy/clip_model.py b/comfy/clip_model.py index 7e47d8a55..7c0cadab5 100644 --- a/comfy/clip_model.py +++ b/comfy/clip_model.py @@ -61,8 +61,12 @@ class CLIPEncoder(torch.nn.Module): def forward(self, x, mask=None, intermediate_output=None): optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True) + all_intermediate = None if intermediate_output is not None: - if intermediate_output < 0: + if intermediate_output == "all": + all_intermediate = [] + intermediate_output = None + elif intermediate_output < 0: intermediate_output = len(self.layers) + intermediate_output intermediate = None @@ -70,6 +74,12 @@ class CLIPEncoder(torch.nn.Module): x = l(x, mask, optimized_attention) if i == intermediate_output: intermediate = x.clone() + if all_intermediate is not None: + all_intermediate.append(x.unsqueeze(1).clone()) + + if all_intermediate is not None: + intermediate = torch.cat(all_intermediate, dim=1) + return x, intermediate class CLIPEmbeddings(torch.nn.Module): diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py index 00aab9164..2fa410cb7 100644 --- a/comfy/clip_vision.py +++ b/comfy/clip_vision.py @@ -50,7 +50,13 @@ class ClipVisionModel(): self.image_size = config.get("image_size", 224) self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073]) self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711]) - model_class = IMAGE_ENCODERS.get(config.get("model_type", "clip_vision_model")) + model_type = config.get("model_type", "clip_vision_model") + model_class = IMAGE_ENCODERS.get(model_type) + if model_type == "siglip_vision_model": + self.return_all_hidden_states = True + else: + self.return_all_hidden_states = False + self.load_device = comfy.model_management.text_encoder_device() offload_device = comfy.model_management.text_encoder_offload_device() self.dtype = comfy.model_management.text_encoder_dtype(self.load_device) @@ -68,12 +74,18 @@ class ClipVisionModel(): def encode_image(self, image, crop=True): comfy.model_management.load_model_gpu(self.patcher) pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float() - out = self.model(pixel_values=pixel_values, intermediate_output=-2) + out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2) outputs = Output() outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device()) outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device()) - outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device()) + if self.return_all_hidden_states: + all_hs = out[1].to(comfy.model_management.intermediate_device()) + outputs["penultimate_hidden_states"] = all_hs[:, -2] + outputs["all_hidden_states"] = all_hs + else: + outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device()) + outputs["mm_projected"] = out[3] return outputs diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 1e62f4626..d4be6bb61 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -106,6 +106,7 @@ class Flux(nn.Module): if y is None: y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype) + patches = transformer_options.get("patches", {}) patches_replace = transformer_options.get("patches_replace", {}) if img.ndim != 3 or txt.ndim != 3: raise ValueError("Input img and txt tensors must have 3 dimensions.") @@ -117,9 +118,17 @@ class Flux(nn.Module): if guidance is not None: vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype)) - vec = vec + self.vector_in(y[:,:self.params.vec_in_dim]) + vec = vec + self.vector_in(y[:, :self.params.vec_in_dim]) txt = self.txt_in(txt) + if "post_input" in patches: + for p in patches["post_input"]: + out = p({"img": img, "txt": txt, "img_ids": img_ids, "txt_ids": txt_ids}) + img = out["img"] + txt = out["txt"] + img_ids = out["img_ids"] + txt_ids = out["txt_ids"] + if img_ids is not None: ids = torch.cat((txt_ids, img_ids), dim=1) pe = self.pe_embedder(ids) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index a944cb421..1fd03d9d1 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -433,6 +433,9 @@ class ModelPatcher: def set_model_double_block_patch(self, patch): self.set_model_patch(patch, "double_block") + def set_model_post_input_patch(self, patch): + self.set_model_patch(patch, "post_input") + def add_object_patch(self, name, obj): self.object_patches[name] = obj diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index 65e766b52..783c59b6b 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -1,4 +1,5 @@ import torch +from torch import nn import folder_paths import comfy.utils import comfy.ops @@ -58,6 +59,136 @@ class QwenImageBlockWiseControlNet(torch.nn.Module): return self.controlnet_blocks[block_id](img, controlnet_conditioning) +class SigLIPMultiFeatProjModel(torch.nn.Module): + """ + SigLIP Multi-Feature Projection Model for processing style features from different layers + and projecting them into a unified hidden space. + + Args: + siglip_token_nums (int): Number of SigLIP tokens, default 257 + style_token_nums (int): Number of style tokens, default 256 + siglip_token_dims (int): Dimension of SigLIP tokens, default 1536 + hidden_size (int): Hidden layer size, default 3072 + context_layer_norm (bool): Whether to use context layer normalization, default False + """ + + def __init__( + self, + siglip_token_nums: int = 729, + style_token_nums: int = 64, + siglip_token_dims: int = 1152, + hidden_size: int = 3072, + context_layer_norm: bool = True, + device=None, dtype=None, operations=None + ): + super().__init__() + + # High-level feature processing (layer -2) + self.high_embedding_linear = nn.Sequential( + operations.Linear(siglip_token_nums, style_token_nums), + nn.SiLU() + ) + self.high_layer_norm = ( + operations.LayerNorm(siglip_token_dims) if context_layer_norm else nn.Identity() + ) + self.high_projection = operations.Linear(siglip_token_dims, hidden_size, bias=True) + + # Mid-level feature processing (layer -11) + self.mid_embedding_linear = nn.Sequential( + operations.Linear(siglip_token_nums, style_token_nums), + nn.SiLU() + ) + self.mid_layer_norm = ( + operations.LayerNorm(siglip_token_dims) if context_layer_norm else nn.Identity() + ) + self.mid_projection = operations.Linear(siglip_token_dims, hidden_size, bias=True) + + # Low-level feature processing (layer -20) + self.low_embedding_linear = nn.Sequential( + operations.Linear(siglip_token_nums, style_token_nums), + nn.SiLU() + ) + self.low_layer_norm = ( + operations.LayerNorm(siglip_token_dims) if context_layer_norm else nn.Identity() + ) + self.low_projection = operations.Linear(siglip_token_dims, hidden_size, bias=True) + + def forward(self, siglip_outputs): + """ + Forward pass function + + Args: + siglip_outputs: Output from SigLIP model, containing hidden_states + + Returns: + torch.Tensor: Concatenated multi-layer features with shape [bs, 3*style_token_nums, hidden_size] + """ + dtype = next(self.high_embedding_linear.parameters()).dtype + + # Process high-level features (layer -2) + high_embedding = self._process_layer_features( + siglip_outputs[2], + self.high_embedding_linear, + self.high_layer_norm, + self.high_projection, + dtype + ) + + # Process mid-level features (layer -11) + mid_embedding = self._process_layer_features( + siglip_outputs[1], + self.mid_embedding_linear, + self.mid_layer_norm, + self.mid_projection, + dtype + ) + + # Process low-level features (layer -20) + low_embedding = self._process_layer_features( + siglip_outputs[0], + self.low_embedding_linear, + self.low_layer_norm, + self.low_projection, + dtype + ) + + # Concatenate features from all layersmodel_patch + return torch.cat((high_embedding, mid_embedding, low_embedding), dim=1) + + def _process_layer_features( + self, + hidden_states: torch.Tensor, + embedding_linear: nn.Module, + layer_norm: nn.Module, + projection: nn.Module, + dtype: torch.dtype + ) -> torch.Tensor: + """ + Helper function to process features from a single layer + + Args: + hidden_states: Input hidden states [bs, seq_len, dim] + embedding_linear: Embedding linear layer + layer_norm: Layer normalization + projection: Projection layer + dtype: Target data type + + Returns: + torch.Tensor: Processed features [bs, style_token_nums, hidden_size] + """ + # Transform dimensions: [bs, seq_len, dim] -> [bs, dim, seq_len] -> [bs, dim, style_token_nums] -> [bs, style_token_nums, dim] + embedding = embedding_linear( + hidden_states.to(dtype).transpose(1, 2) + ).transpose(1, 2) + + # Apply layer normalization + embedding = layer_norm(embedding) + + # Project to target hidden space + embedding = projection(embedding) + + return embedding + class ModelPatchLoader: @classmethod def INPUT_TYPES(s): @@ -73,9 +204,14 @@ class ModelPatchLoader: model_patch_path = folder_paths.get_full_path_or_raise("model_patches", name) sd = comfy.utils.load_torch_file(model_patch_path, safe_load=True) dtype = comfy.utils.weight_dtype(sd) - # TODO: this node will work with more types of model patches - additional_in_dim = sd["img_in.weight"].shape[1] - 64 - model = QwenImageBlockWiseControlNet(additional_in_dim=additional_in_dim, device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + + if 'controlnet_blocks.0.y_rms.weight' in sd: + additional_in_dim = sd["img_in.weight"].shape[1] - 64 + model = QwenImageBlockWiseControlNet(additional_in_dim=additional_in_dim, device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + elif 'feature_embedder.mid_layer_norm.bias' in sd: + sd = comfy.utils.state_dict_prefix_replace(sd, {"feature_embedder.": ""}, filter_keys=True) + model = SigLIPMultiFeatProjModel(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + model.load_state_dict(sd) model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) return (model,) @@ -157,7 +293,51 @@ class QwenImageDiffsynthControlnet: return (model_patched,) +class UsoStyleProjectorPatch: + def __init__(self, model_patch, encoded_image): + self.model_patch = model_patch + self.encoded_image = encoded_image + + def __call__(self, kwargs): + txt_ids = kwargs.get("txt_ids") + txt = kwargs.get("txt") + siglip_embedding = self.model_patch.model(self.encoded_image.to(txt.dtype)).to(txt.dtype) + txt = torch.cat([siglip_embedding, txt], dim=1) + kwargs['txt'] = txt + kwargs['txt_ids'] = torch.cat([torch.zeros(siglip_embedding.shape[0], siglip_embedding.shape[1], 3, dtype=txt_ids.dtype, device=txt_ids.device), txt_ids], dim=1) + return kwargs + + def to(self, device_or_dtype): + if isinstance(device_or_dtype, torch.device): + self.encoded_image = self.encoded_image.to(device_or_dtype) + return self + + def models(self): + return [self.model_patch] + + +class USOStyleReference: + @classmethod + def INPUT_TYPES(s): + return {"required": {"model": ("MODEL",), + "model_patch": ("MODEL_PATCH",), + "clip_vision_output": ("CLIP_VISION_OUTPUT", ), + }} + RETURN_TYPES = ("MODEL",) + FUNCTION = "apply_patch" + EXPERIMENTAL = True + + CATEGORY = "advanced/model_patches/flux" + + def apply_patch(self, model, model_patch, clip_vision_output): + encoded_image = torch.stack((clip_vision_output.all_hidden_states[:, -20], clip_vision_output.all_hidden_states[:, -11], clip_vision_output.penultimate_hidden_states)) + model_patched = model.clone() + model_patched.set_model_post_input_patch(UsoStyleProjectorPatch(model_patch, encoded_image)) + return (model_patched,) + + NODE_CLASS_MAPPINGS = { "ModelPatchLoader": ModelPatchLoader, "QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet, + "USOStyleReference": USOStyleReference, } From e3018c2a5aeb99f0c5b595621949a451686ce55a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 2 Sep 2025 13:12:07 -0700 Subject: [PATCH 227/759] uso -> uxo/uno as requested. (#9688) --- comfy/ldm/flux/model.py | 2 +- comfy_extras/nodes_flux.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index d4be6bb61..8ea7d4f57 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -248,7 +248,7 @@ class Flux(nn.Module): index += 1 h_offset = 0 w_offset = 0 - elif ref_latents_method == "uso": + elif ref_latents_method == "uxo": index = 0 h_offset = h_len * patch_size + h w_offset = w_len * patch_size + w diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index 1bf7ddd92..25e029ffd 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -105,7 +105,7 @@ class FluxKontextMultiReferenceLatentMethod: def INPUT_TYPES(s): return {"required": { "conditioning": ("CONDITIONING", ), - "reference_latents_method": (("offset", "index", "uso"), ), + "reference_latents_method": (("offset", "index", "uxo/uno"), ), }} RETURN_TYPES = ("CONDITIONING",) @@ -115,6 +115,8 @@ class FluxKontextMultiReferenceLatentMethod: CATEGORY = "advanced/conditioning/flux" def append(self, conditioning, reference_latents_method): + if "uxo" in reference_latents_method or "uso" in reference_latents_method: + reference_latents_method = "uxo" c = node_helpers.conditioning_set_values(conditioning, {"reference_latents_method": reference_latents_method}) return (c, ) From 464ba1d6140eda6a0173703ac00c69f7fddab6ba Mon Sep 17 00:00:00 2001 From: Deep Roy Date: Tue, 2 Sep 2025 19:41:10 -0400 Subject: [PATCH 228/759] Accept prompt_id in interrupt handler (#9607) * Accept prompt_id in interrupt handler * remove a log --- server.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/server.py b/server.py index 8f9c88ebf..3d323eaf8 100644 --- a/server.py +++ b/server.py @@ -729,7 +729,34 @@ class PromptServer(): @routes.post("/interrupt") async def post_interrupt(request): - nodes.interrupt_processing() + try: + json_data = await request.json() + except json.JSONDecodeError: + json_data = {} + + # Check if a specific prompt_id was provided for targeted interruption + prompt_id = json_data.get('prompt_id') + if prompt_id: + currently_running, _ = self.prompt_queue.get_current_queue() + + # Check if the prompt_id matches any currently running prompt + should_interrupt = False + for item in currently_running: + # item structure: (number, prompt_id, prompt, extra_data, outputs_to_execute) + if item[1] == prompt_id: + logging.info(f"Interrupting prompt {prompt_id}") + should_interrupt = True + break + + if should_interrupt: + nodes.interrupt_processing() + else: + logging.info(f"Prompt {prompt_id} is not currently running, skipping interrupt") + else: + # No prompt_id provided, do a global interrupt + logging.info("Global interrupt (no prompt_id specified)") + nodes.interrupt_processing() + return web.Response(status=200) @routes.post("/free") From 1bcb469089a71fb1946b9f14e994df1b42b83def Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 2 Sep 2025 17:05:57 -0700 Subject: [PATCH 229/759] ImageScaleToMaxDimension node. (#9689) --- comfy_extras/nodes_images.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/comfy_extras/nodes_images.py b/comfy_extras/nodes_images.py index fba80e2ae..392aea32c 100644 --- a/comfy_extras/nodes_images.py +++ b/comfy_extras/nodes_images.py @@ -625,6 +625,37 @@ class ImageFlip: return (image,) +class ImageScaleToMaxDimension: + upscale_methods = ["area", "lanczos", "bilinear", "nearest-exact", "bilinear", "bicubic"] + + @classmethod + def INPUT_TYPES(s): + return {"required": {"image": ("IMAGE",), + "upscale_method": (s.upscale_methods,), + "largest_size": ("INT", {"default": 512, "min": 0, "max": MAX_RESOLUTION, "step": 1})}} + RETURN_TYPES = ("IMAGE",) + FUNCTION = "upscale" + + CATEGORY = "image/upscaling" + + def upscale(self, image, upscale_method, largest_size): + height = image.shape[1] + width = image.shape[2] + + if height > width: + width = round((width / height) * largest_size) + height = largest_size + elif width > height: + height = round((height / width) * largest_size) + width = largest_size + else: + height = largest_size + width = largest_size + + samples = image.movedim(-1, 1) + s = comfy.utils.common_upscale(samples, width, height, upscale_method, "disabled") + s = s.movedim(1, -1) + return (s,) NODE_CLASS_MAPPINGS = { "ImageCrop": ImageCrop, @@ -639,4 +670,5 @@ NODE_CLASS_MAPPINGS = { "GetImageSize": GetImageSize, "ImageRotate": ImageRotate, "ImageFlip": ImageFlip, + "ImageScaleToMaxDimension": ImageScaleToMaxDimension, } From 4f5812b93712e0f52ae8fe80a89e8b5e7d0fa309 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 3 Sep 2025 08:06:41 +0800 Subject: [PATCH 230/759] Update template to 0.1.73 (#9686) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7f64aacca..4ebe6cc2a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.11 -comfyui-workflow-templates==0.1.70 +comfyui-workflow-templates==0.1.73 comfyui-embedded-docs==0.2.6 torch torchsde From 26d5b86da8ceb4589ee70f12ff2209b93a2d99e0 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:17:07 +0300 Subject: [PATCH 231/759] feat(api-nodes): add ByteDance Image nodes (#9477) --- comfy_api_nodes/nodes_bytedance.py | 336 +++++++++++++++++++++++++++++ nodes.py | 1 + 2 files changed, 337 insertions(+) create mode 100644 comfy_api_nodes/nodes_bytedance.py diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py new file mode 100644 index 000000000..fb6aba7fa --- /dev/null +++ b/comfy_api_nodes/nodes_bytedance.py @@ -0,0 +1,336 @@ +import logging +from enum import Enum +from typing import Optional +from typing_extensions import override + +import torch +from pydantic import BaseModel, Field + +from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api_nodes.util.validation_utils import ( + validate_image_aspect_ratio_range, + get_number_of_images, +) +from comfy_api_nodes.apis.client import ( + ApiEndpoint, + HttpMethod, + SynchronousOperation, +) +from comfy_api_nodes.apinode_utils import download_url_to_image_tensor, upload_images_to_comfyapi, validate_string + + +BYTEPLUS_ENDPOINT = "/proxy/byteplus/api/v3/images/generations" + + +class Text2ImageModelName(str, Enum): + seedream3 = "seedream-3-0-t2i-250415" + + +class Image2ImageModelName(str, Enum): + seededit3 = "seededit-3-0-i2i-250628" + + +class Text2ImageTaskCreationRequest(BaseModel): + model: Text2ImageModelName = Text2ImageModelName.seedream3 + prompt: str = Field(...) + response_format: Optional[str] = Field("url") + size: Optional[str] = Field(None) + seed: Optional[int] = Field(0, ge=0, le=2147483647) + guidance_scale: Optional[float] = Field(..., ge=1.0, le=10.0) + watermark: Optional[bool] = Field(True) + + +class Image2ImageTaskCreationRequest(BaseModel): + model: Image2ImageModelName = Image2ImageModelName.seededit3 + prompt: str = Field(...) + response_format: Optional[str] = Field("url") + image: str = Field(..., description="Base64 encoded string or image URL") + size: Optional[str] = Field("adaptive") + seed: Optional[int] = Field(..., ge=0, le=2147483647) + guidance_scale: Optional[float] = Field(..., ge=1.0, le=10.0) + watermark: Optional[bool] = Field(True) + + +class ImageTaskCreationResponse(BaseModel): + model: str = Field(...) + created: int = Field(..., description="Unix timestamp (in seconds) indicating time when the request was created.") + data: list = Field([], description="Contains information about the generated image(s).") + error: dict = Field({}, description="Contains `code` and `message` fields in case of error.") + + +RECOMMENDED_PRESETS = [ + ("1024x1024 (1:1)", 1024, 1024), + ("864x1152 (3:4)", 864, 1152), + ("1152x864 (4:3)", 1152, 864), + ("1280x720 (16:9)", 1280, 720), + ("720x1280 (9:16)", 720, 1280), + ("832x1248 (2:3)", 832, 1248), + ("1248x832 (3:2)", 1248, 832), + ("1512x648 (21:9)", 1512, 648), + ("2048x2048 (1:1)", 2048, 2048), + ("Custom", None, None), +] + + +def get_image_url_from_response(response: ImageTaskCreationResponse) -> str: + if response.error: + error_msg = f"ByteDance request failed. Code: {response.error['code']}, message: {response.error['message']}" + logging.info(error_msg) + raise RuntimeError(error_msg) + logging.info("ByteDance task succeeded, image URL: %s", response.data[0]["url"]) + return response.data[0]["url"] + + +class ByteDanceImageNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ByteDanceImageNode", + display_name="ByteDance Image", + category="api node/image/ByteDance", + description="Generate images using ByteDance models via api based on prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in Text2ImageModelName], + default=Text2ImageModelName.seedream3.value, + tooltip="Model name", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="The text prompt used to generate the image", + ), + comfy_io.Combo.Input( + "size_preset", + options=[label for label, _, _ in RECOMMENDED_PRESETS], + tooltip="Pick a recommended size. Select Custom to use the width and height below", + ), + comfy_io.Int.Input( + "width", + default=1024, + min=512, + max=2048, + step=64, + tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`", + ), + comfy_io.Int.Input( + "height", + default=1024, + min=512, + max=2048, + step=64, + tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation", + optional=True, + ), + comfy_io.Float.Input( + "guidance_scale", + default=2.5, + min=1.0, + max=10.0, + step=0.01, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Higher value makes the image follow the prompt more closely", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the image", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + size_preset: str, + width: int, + height: int, + seed: int, + guidance_scale: float, + watermark: bool, + ) -> comfy_io.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + w = h = None + for label, tw, th in RECOMMENDED_PRESETS: + if label == size_preset: + w, h = tw, th + break + + if w is None or h is None: + w, h = width, height + if not (512 <= w <= 2048) or not (512 <= h <= 2048): + raise ValueError( + f"Custom size out of range: {w}x{h}. " + "Both width and height must be between 512 and 2048 pixels." + ) + + payload = Text2ImageTaskCreationRequest( + model=model, + prompt=prompt, + size=f"{w}x{h}", + seed=seed, + guidance_scale=guidance_scale, + watermark=watermark, + ) + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + response = await SynchronousOperation( + endpoint=ApiEndpoint( + path=BYTEPLUS_ENDPOINT, + method=HttpMethod.POST, + request_model=Text2ImageTaskCreationRequest, + response_model=ImageTaskCreationResponse, + ), + request=payload, + auth_kwargs=auth_kwargs, + ).execute() + return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) + + +class ByteDanceImageEditNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ByteDanceImageEditNode", + display_name="ByteDance Image Edit", + category="api node/video/ByteDance", + description="Edit images using ByteDance models via api based on prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in Image2ImageModelName], + default=Image2ImageModelName.seededit3.value, + tooltip="Model name", + ), + comfy_io.Image.Input( + "image", + tooltip="The base image to edit", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Instruction to edit image", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation", + optional=True, + ), + comfy_io.Float.Input( + "guidance_scale", + default=5.5, + min=1.0, + max=10.0, + step=0.01, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Higher value makes the image follow the prompt more closely", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the image", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + image: torch.Tensor, + prompt: str, + seed: int, + guidance_scale: float, + watermark: bool, + ) -> comfy_io.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + if get_number_of_images(image) != 1: + raise ValueError("Exactly one input image is required.") + validate_image_aspect_ratio_range(image, (1, 3), (3, 1)) + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + source_url = (await upload_images_to_comfyapi( + image, + max_images=1, + mime_type="image/png", + auth_kwargs=auth_kwargs, + ))[0] + payload = Image2ImageTaskCreationRequest( + model=model, + prompt=prompt, + image=source_url, + seed=seed, + guidance_scale=guidance_scale, + watermark=watermark, + ) + response = await SynchronousOperation( + endpoint=ApiEndpoint( + path=BYTEPLUS_ENDPOINT, + method=HttpMethod.POST, + request_model=Image2ImageTaskCreationRequest, + response_model=ImageTaskCreationResponse, + ), + request=payload, + auth_kwargs=auth_kwargs, + ).execute() + return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) + + +class ByteDanceExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + ByteDanceImageNode, + ByteDanceImageEditNode, + ] + +async def comfy_entrypoint() -> ByteDanceExtension: + return ByteDanceExtension() diff --git a/nodes.py b/nodes.py index 0aff6b14a..6c2f9dd14 100644 --- a/nodes.py +++ b/nodes.py @@ -2344,6 +2344,7 @@ async def init_builtin_api_nodes(): "nodes_veo2.py", "nodes_kling.py", "nodes_bfl.py", + "nodes_bytedance.py", "nodes_luma.py", "nodes_recraft.py", "nodes_pixverse.py", From 50333f1715c03aa4100711eb6d075516a4021d24 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:17:37 +0300 Subject: [PATCH 232/759] api nodes(Ideogram): add Ideogram Character (#9616) * api nodes(Ideogram): add Ideogram Character * rename renderingSpeed default value from 'balanced' to 'default' --- comfy_api_nodes/apis/__init__.py | 22 ++++++++- comfy_api_nodes/nodes_ideogram.py | 77 ++++++++++++++++++++++++++++--- 2 files changed, 91 insertions(+), 8 deletions(-) diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 7a09df55b..78a23db30 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -951,7 +951,11 @@ class MagicPrompt2(str, Enum): class StyleType1(str, Enum): + AUTO = 'AUTO' GENERAL = 'GENERAL' + REALISTIC = 'REALISTIC' + DESIGN = 'DESIGN' + FICTION = 'FICTION' class ImagenImageGenerationInstance(BaseModel): @@ -2676,7 +2680,7 @@ class ReleaseNote(BaseModel): class RenderingSpeed(str, Enum): - BALANCED = 'BALANCED' + DEFAULT = 'DEFAULT' TURBO = 'TURBO' QUALITY = 'QUALITY' @@ -4918,6 +4922,14 @@ class IdeogramV3EditRequest(BaseModel): None, description='A set of images to use as style references (maximum total size 10MB across all style references). The images should be in JPEG, PNG or WebP format.', ) + character_reference_images: Optional[List[str]] = Field( + None, + description='Generations with character reference are subject to the character reference pricing. A set of images to use as character references (maximum total size 10MB across all character references), currently only supports 1 character reference image. The images should be in JPEG, PNG or WebP format.' + ) + character_reference_images_mask: Optional[List[str]] = Field( + None, + description='Optional masks for character reference images. When provided, must match the number of character_reference_images. Each mask should be a grayscale image of the same dimensions as the corresponding character reference image. The images should be in JPEG, PNG or WebP format.' + ) class IdeogramV3Request(BaseModel): @@ -4951,6 +4963,14 @@ class IdeogramV3Request(BaseModel): style_type: Optional[StyleType1] = Field( None, description='The type of style to apply' ) + character_reference_images: Optional[List[str]] = Field( + None, + description='Generations with character reference are subject to the character reference pricing. A set of images to use as character references (maximum total size 10MB across all character references), currently only supports 1 character reference image. The images should be in JPEG, PNG or WebP format.' + ) + character_reference_images_mask: Optional[List[str]] = Field( + None, + description='Optional masks for character reference images. When provided, must match the number of character_reference_images. Each mask should be a grayscale image of the same dimensions as the corresponding character reference image. The images should be in JPEG, PNG or WebP format.' + ) class ImagenGenerateImageResponse(BaseModel): diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index d28895f3e..2d1c32e4f 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -255,6 +255,7 @@ class IdeogramV1(comfy_io.ComfyNode): display_name="Ideogram V1", category="api node/image/Ideogram", description="Generates images using the Ideogram V1 model.", + is_api_node=True, inputs=[ comfy_io.String.Input( "prompt", @@ -383,6 +384,7 @@ class IdeogramV2(comfy_io.ComfyNode): display_name="Ideogram V2", category="api node/image/Ideogram", description="Generates images using the Ideogram V2 model.", + is_api_node=True, inputs=[ comfy_io.String.Input( "prompt", @@ -552,6 +554,7 @@ class IdeogramV3(comfy_io.ComfyNode): category="api node/image/Ideogram", description="Generates images using the Ideogram V3 model. " "Supports both regular image generation from text prompts and image editing with mask.", + is_api_node=True, inputs=[ comfy_io.String.Input( "prompt", @@ -612,11 +615,21 @@ class IdeogramV3(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "rendering_speed", - options=["BALANCED", "TURBO", "QUALITY"], - default="BALANCED", + options=["DEFAULT", "TURBO", "QUALITY"], + default="DEFAULT", tooltip="Controls the trade-off between generation speed and quality", optional=True, ), + comfy_io.Image.Input( + "character_image", + tooltip="Image to use as character reference.", + optional=True, + ), + comfy_io.Mask.Input( + "character_mask", + tooltip="Optional mask for character reference image.", + optional=True, + ), ], outputs=[ comfy_io.Image.Output(), @@ -639,12 +652,46 @@ class IdeogramV3(comfy_io.ComfyNode): magic_prompt_option="AUTO", seed=0, num_images=1, - rendering_speed="BALANCED", + rendering_speed="DEFAULT", + character_image=None, + character_mask=None, ): auth = { "auth_token": cls.hidden.auth_token_comfy_org, "comfy_api_key": cls.hidden.api_key_comfy_org, } + if rendering_speed == "BALANCED": # for backward compatibility + rendering_speed = "DEFAULT" + + character_img_binary = None + character_mask_binary = None + + if character_image is not None: + input_tensor = character_image.squeeze().cpu() + if character_mask is not None: + character_mask = resize_mask_to_image(character_mask, character_image, allow_gradient=False) + character_mask = 1.0 - character_mask + if character_mask.shape[1:] != character_image.shape[1:-1]: + raise Exception("Character mask and image must be the same size") + + mask_np = (character_mask.squeeze().cpu().numpy() * 255).astype(np.uint8) + mask_img = Image.fromarray(mask_np) + mask_byte_arr = BytesIO() + mask_img.save(mask_byte_arr, format="PNG") + mask_byte_arr.seek(0) + character_mask_binary = mask_byte_arr + character_mask_binary.name = "mask.png" + + img_np = (input_tensor.numpy() * 255).astype(np.uint8) + img = Image.fromarray(img_np) + img_byte_arr = BytesIO() + img.save(img_byte_arr, format="PNG") + img_byte_arr.seek(0) + character_img_binary = img_byte_arr + character_img_binary.name = "image.png" + elif character_mask is not None: + raise Exception("Character mask requires character image to be present") + # Check if both image and mask are provided for editing mode if image is not None and mask is not None: # Edit mode @@ -693,6 +740,15 @@ class IdeogramV3(comfy_io.ComfyNode): if num_images > 1: edit_request.num_images = num_images + files = { + "image": img_binary, + "mask": mask_binary, + } + if character_img_binary: + files["character_reference_images"] = character_img_binary + if character_mask_binary: + files["character_mask_binary"] = character_mask_binary + # Execute the operation for edit mode operation = SynchronousOperation( endpoint=ApiEndpoint( @@ -702,10 +758,7 @@ class IdeogramV3(comfy_io.ComfyNode): response_model=IdeogramGenerateResponse, ), request=edit_request, - files={ - "image": img_binary, - "mask": mask_binary, - }, + files=files, content_type="multipart/form-data", auth_kwargs=auth, ) @@ -739,6 +792,14 @@ class IdeogramV3(comfy_io.ComfyNode): if num_images > 1: gen_request.num_images = num_images + files = {} + if character_img_binary: + files["character_reference_images"] = character_img_binary + if character_mask_binary: + files["character_mask_binary"] = character_mask_binary + if files: + gen_request.style_type = "AUTO" + # Execute the operation for generation mode operation = SynchronousOperation( endpoint=ApiEndpoint( @@ -748,6 +809,8 @@ class IdeogramV3(comfy_io.ComfyNode): response_model=IdeogramGenerateResponse, ), request=gen_request, + files=files if files else None, + content_type="multipart/form-data", auth_kwargs=auth, ) From 22da0a83e9a251ca16b9753bf808bfa9f4b023d8 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:18:27 +0300 Subject: [PATCH 233/759] [V3] convert Runway API nodes to the V3 schema (#9487) * convert RunAway API nodes to the V3 schema * fixed small typo * fix: add tooltip for "seed" input --- comfy_api_nodes/nodes_runway.py | 744 +++++++++++++++----------------- 1 file changed, 357 insertions(+), 387 deletions(-) diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py index 98024a9fa..27b2bf748 100644 --- a/comfy_api_nodes/nodes_runway.py +++ b/comfy_api_nodes/nodes_runway.py @@ -12,6 +12,7 @@ User Guides: """ from typing import Union, Optional, Any +from typing_extensions import override from enum import Enum import torch @@ -46,9 +47,9 @@ from comfy_api_nodes.apinode_utils import ( validate_string, download_url_to_image_tensor, ) -from comfy_api_nodes.mapper_utils import model_field_to_node_input from comfy_api.input_impl import VideoFromFile -from comfy.comfy_types.node_typing import IO, ComfyNodeABC +from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api_nodes.util.validation_utils import validate_image_dimensions, validate_image_aspect_ratio PATH_IMAGE_TO_VIDEO = "/proxy/runway/image_to_video" PATH_TEXT_TO_IMAGE = "/proxy/runway/text_to_image" @@ -85,20 +86,11 @@ class RunwayGen3aAspectRatio(str, Enum): def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]: """Returns the video URL from the task status response if it exists.""" - if response.output and len(response.output) > 0: + if hasattr(response, "output") and len(response.output) > 0: return response.output[0] return None -# TODO: replace with updated image validation utils (upstream) -def validate_input_image(image: torch.Tensor) -> bool: - """ - Validate the input image is within the size limits for the Runway API. - See: https://docs.dev.runwayml.com/assets/inputs/#common-error-reasons - """ - return image.shape[2] < 8000 and image.shape[1] < 8000 - - async def poll_until_finished( auth_kwargs: dict[str, str], api_endpoint: ApiEndpoint[Any, TaskStatusResponse], @@ -134,458 +126,438 @@ def extract_progress_from_task_status( def get_image_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]: """Returns the image URL from the task status response if it exists.""" - if response.output and len(response.output) > 0: + if hasattr(response, "output") and len(response.output) > 0: return response.output[0] return None -class RunwayVideoGenNode(ComfyNodeABC): - """Runway Video Node Base.""" +async def get_response( + task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None, estimated_duration: Optional[int] = None +) -> TaskStatusResponse: + """Poll the task status until it is finished then get the response.""" + return await poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=f"{PATH_GET_TASK_STATUS}/{task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=TaskStatusResponse, + ), + estimated_duration=estimated_duration, + node_id=node_id, + ) - RETURN_TYPES = ("VIDEO",) - FUNCTION = "api_call" - CATEGORY = "api node/video/Runway" - API_NODE = True - def validate_task_created(self, response: RunwayImageToVideoResponse) -> bool: - """ - Validate the task creation response from the Runway API matches - expected format. - """ - if not bool(response.id): - raise RunwayApiError("Invalid initial response from Runway API.") - return True +async def generate_video( + request: RunwayImageToVideoRequest, + auth_kwargs: dict[str, str], + node_id: Optional[str] = None, + estimated_duration: Optional[int] = None, +) -> VideoFromFile: + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path=PATH_IMAGE_TO_VIDEO, + method=HttpMethod.POST, + request_model=RunwayImageToVideoRequest, + response_model=RunwayImageToVideoResponse, + ), + request=request, + auth_kwargs=auth_kwargs, + ) - def validate_response(self, response: RunwayImageToVideoResponse) -> bool: - """ - Validate the successful task status response from the Runway API - matches expected format. - """ - if not response.output or len(response.output) == 0: - raise RunwayApiError( - "Runway task succeeded but no video data found in response." - ) - return True + initial_response = await initial_operation.execute() - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> RunwayImageToVideoResponse: - """Poll the task status until it is finished then get the response.""" - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_GET_TASK_STATUS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TaskStatusResponse, - ), - estimated_duration=AVERAGE_DURATION_FLF_SECONDS, - node_id=node_id, + final_response = await get_response(initial_response.id, auth_kwargs, node_id, estimated_duration) + if not final_response.output: + raise RunwayApiError("Runway task succeeded but no video data found in response.") + + video_url = get_video_url_from_task_status(final_response) + return await download_url_to_video_output(video_url) + + +class RunwayImageToVideoNodeGen3a(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="RunwayImageToVideoNodeGen3a", + display_name="Runway Image to Video (Gen3a Turbo)", + category="api node/video/Runway", + description="Generate a video from a single starting frame using Gen3a Turbo model. " + "Before diving in, review these best practices to ensure that " + "your input selections will set your generation up for success: " + "https://help.runwayml.com/hc/en-us/articles/33927968552339-Creating-with-Act-One-on-Gen-3-Alpha-and-Turbo.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for the generation", + ), + comfy_io.Image.Input( + "start_frame", + tooltip="Start frame to be used for the video", + ), + comfy_io.Combo.Input( + "duration", + options=[model.value for model in Duration], + ), + comfy_io.Combo.Input( + "ratio", + options=[model.value for model in RunwayGen3aAspectRatio], + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967295, + step=1, + control_after_generate=True, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Random seed for generation", + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) - async def generate_video( - self, - request: RunwayImageToVideoRequest, - auth_kwargs: dict[str, str], - node_id: Optional[str] = None, - ) -> tuple[VideoFromFile]: - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_IMAGE_TO_VIDEO, - method=HttpMethod.POST, - request_model=RunwayImageToVideoRequest, - response_model=RunwayImageToVideoResponse, - ), - request=request, + @classmethod + async def execute( + cls, + prompt: str, + start_frame: torch.Tensor, + duration: str, + ratio: str, + seed: int, + ) -> comfy_io.NodeOutput: + validate_string(prompt, min_length=1) + validate_image_dimensions(start_frame, max_width=7999, max_height=7999) + validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) + + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + + download_urls = await upload_images_to_comfyapi( + start_frame, + max_images=1, + mime_type="image/png", auth_kwargs=auth_kwargs, ) - initial_response = await initial_operation.execute() - self.validate_task_created(initial_response) - task_id = initial_response.id - - final_response = await self.get_response(task_id, auth_kwargs, node_id) - self.validate_response(final_response) - - video_url = get_video_url_from_task_status(final_response) - return (await download_url_to_video_output(video_url),) + return comfy_io.NodeOutput( + await generate_video( + RunwayImageToVideoRequest( + promptText=prompt, + seed=seed, + model=Model("gen3a_turbo"), + duration=Duration(duration), + ratio=AspectRatio(ratio), + promptImage=RunwayPromptImageObject( + root=[ + RunwayPromptImageDetailedObject( + uri=str(download_urls[0]), position="first" + ) + ] + ), + ), + auth_kwargs=auth_kwargs, + node_id=cls.hidden.unique_id, + ) + ) -class RunwayImageToVideoNodeGen3a(RunwayVideoGenNode): - """Runway Image to Video Node using Gen3a Turbo model.""" - - DESCRIPTION = "Generate a video from a single starting frame using Gen3a Turbo model. Before diving in, review these best practices to ensure that your input selections will set your generation up for success: https://help.runwayml.com/hc/en-us/articles/33927968552339-Creating-with-Act-One-on-Gen-3-Alpha-and-Turbo." +class RunwayImageToVideoNodeGen4(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, RunwayImageToVideoRequest, "promptText", multiline=True + def define_schema(cls): + return comfy_io.Schema( + node_id="RunwayImageToVideoNodeGen4", + display_name="Runway Image to Video (Gen4 Turbo)", + category="api node/video/Runway", + description="Generate a video from a single starting frame using Gen4 Turbo model. " + "Before diving in, review these best practices to ensure that " + "your input selections will set your generation up for success: " + "https://help.runwayml.com/hc/en-us/articles/37327109429011-Creating-with-Gen-4-Video.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for the generation", ), - "start_frame": ( - IO.IMAGE, - {"tooltip": "Start frame to be used for the video"}, + comfy_io.Image.Input( + "start_frame", + tooltip="Start frame to be used for the video", ), - "duration": model_field_to_node_input( - IO.COMBO, RunwayImageToVideoRequest, "duration", enum_type=Duration + comfy_io.Combo.Input( + "duration", + options=[model.value for model in Duration], ), - "ratio": model_field_to_node_input( - IO.COMBO, - RunwayImageToVideoRequest, + comfy_io.Combo.Input( "ratio", - enum_type=RunwayGen3aAspectRatio, + options=[model.value for model in RunwayGen4TurboAspectRatio], ), - "seed": model_field_to_node_input( - IO.INT, - RunwayImageToVideoRequest, + comfy_io.Int.Input( "seed", + default=0, + min=0, + max=4294967295, + step=1, control_after_generate=True, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Random seed for generation", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, start_frame: torch.Tensor, duration: str, ratio: str, seed: int, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile]: - # Validate inputs + ) -> comfy_io.NodeOutput: validate_string(prompt, min_length=1) - validate_input_image(start_frame) + validate_image_dimensions(start_frame, max_width=7999, max_height=7999) + validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) + + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } - # Upload image download_urls = await upload_images_to_comfyapi( start_frame, max_images=1, mime_type="image/png", - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) - if len(download_urls) != 1: - raise RunwayApiError("Failed to upload one or more images to comfy api.") - return await self.generate_video( - RunwayImageToVideoRequest( - promptText=prompt, - seed=seed, - model=Model("gen3a_turbo"), - duration=Duration(duration), - ratio=AspectRatio(ratio), - promptImage=RunwayPromptImageObject( - root=[ - RunwayPromptImageDetailedObject( - uri=str(download_urls[0]), position="first" - ) - ] + return comfy_io.NodeOutput( + await generate_video( + RunwayImageToVideoRequest( + promptText=prompt, + seed=seed, + model=Model("gen4_turbo"), + duration=Duration(duration), + ratio=AspectRatio(ratio), + promptImage=RunwayPromptImageObject( + root=[ + RunwayPromptImageDetailedObject( + uri=str(download_urls[0]), position="first" + ) + ] + ), ), - ), - auth_kwargs=kwargs, - node_id=unique_id, + auth_kwargs=auth_kwargs, + node_id=cls.hidden.unique_id, + estimated_duration=AVERAGE_DURATION_FLF_SECONDS, + ) ) -class RunwayImageToVideoNodeGen4(RunwayVideoGenNode): - """Runway Image to Video Node using Gen4 Turbo model.""" - - DESCRIPTION = "Generate a video from a single starting frame using Gen4 Turbo model. Before diving in, review these best practices to ensure that your input selections will set your generation up for success: https://help.runwayml.com/hc/en-us/articles/37327109429011-Creating-with-Gen-4-Video." +class RunwayFirstLastFrameNode(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, RunwayImageToVideoRequest, "promptText", multiline=True + def define_schema(cls): + return comfy_io.Schema( + node_id="RunwayFirstLastFrameNode", + display_name="Runway First-Last-Frame to Video", + category="api node/video/Runway", + description="Upload first and last keyframes, draft a prompt, and generate a video. " + "More complex transitions, such as cases where the Last frame is completely different " + "from the First frame, may benefit from the longer 10s duration. " + "This would give the generation more time to smoothly transition between the two inputs. " + "Before diving in, review these best practices to ensure that your input selections " + "will set your generation up for success: " + "https://help.runwayml.com/hc/en-us/articles/34170748696595-Creating-with-Keyframes-on-Gen-3.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for the generation", ), - "start_frame": ( - IO.IMAGE, - {"tooltip": "Start frame to be used for the video"}, + comfy_io.Image.Input( + "start_frame", + tooltip="Start frame to be used for the video", ), - "duration": model_field_to_node_input( - IO.COMBO, RunwayImageToVideoRequest, "duration", enum_type=Duration + comfy_io.Image.Input( + "end_frame", + tooltip="End frame to be used for the video. Supported for gen3a_turbo only.", ), - "ratio": model_field_to_node_input( - IO.COMBO, - RunwayImageToVideoRequest, + comfy_io.Combo.Input( + "duration", + options=[model.value for model in Duration], + ), + comfy_io.Combo.Input( "ratio", - enum_type=RunwayGen4TurboAspectRatio, + options=[model.value for model in RunwayGen3aAspectRatio], ), - "seed": model_field_to_node_input( - IO.INT, - RunwayImageToVideoRequest, + comfy_io.Int.Input( "seed", + default=0, + min=0, + max=4294967295, + step=1, control_after_generate=True, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Random seed for generation", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, - prompt: str, - start_frame: torch.Tensor, - duration: str, - ratio: str, - seed: int, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile]: - # Validate inputs - validate_string(prompt, min_length=1) - validate_input_image(start_frame) - - # Upload image - download_urls = await upload_images_to_comfyapi( - start_frame, - max_images=1, - mime_type="image/png", - auth_kwargs=kwargs, - ) - if len(download_urls) != 1: - raise RunwayApiError("Failed to upload one or more images to comfy api.") - - return await self.generate_video( - RunwayImageToVideoRequest( - promptText=prompt, - seed=seed, - model=Model("gen4_turbo"), - duration=Duration(duration), - ratio=AspectRatio(ratio), - promptImage=RunwayPromptImageObject( - root=[ - RunwayPromptImageDetailedObject( - uri=str(download_urls[0]), position="first" - ) - ] - ), - ), - auth_kwargs=kwargs, - node_id=unique_id, - ) - - -class RunwayFirstLastFrameNode(RunwayVideoGenNode): - """Runway First-Last Frame Node.""" - - DESCRIPTION = "Upload first and last keyframes, draft a prompt, and generate a video. More complex transitions, such as cases where the Last frame is completely different from the First frame, may benefit from the longer 10s duration. This would give the generation more time to smoothly transition between the two inputs. Before diving in, review these best practices to ensure that your input selections will set your generation up for success: https://help.runwayml.com/hc/en-us/articles/34170748696595-Creating-with-Keyframes-on-Gen-3." - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> RunwayImageToVideoResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_GET_TASK_STATUS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TaskStatusResponse, - ), - estimated_duration=AVERAGE_DURATION_FLF_SECONDS, - node_id=node_id, + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, RunwayImageToVideoRequest, "promptText", multiline=True - ), - "start_frame": ( - IO.IMAGE, - {"tooltip": "Start frame to be used for the video"}, - ), - "end_frame": ( - IO.IMAGE, - { - "tooltip": "End frame to be used for the video. Supported for gen3a_turbo only." - }, - ), - "duration": model_field_to_node_input( - IO.COMBO, RunwayImageToVideoRequest, "duration", enum_type=Duration - ), - "ratio": model_field_to_node_input( - IO.COMBO, - RunwayImageToVideoRequest, - "ratio", - enum_type=RunwayGen3aAspectRatio, - ), - "seed": model_field_to_node_input( - IO.INT, - RunwayImageToVideoRequest, - "seed", - control_after_generate=True, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "unique_id": "UNIQUE_ID", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, start_frame: torch.Tensor, end_frame: torch.Tensor, duration: str, ratio: str, seed: int, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile]: - # Validate inputs + ) -> comfy_io.NodeOutput: validate_string(prompt, min_length=1) - validate_input_image(start_frame) - validate_input_image(end_frame) + validate_image_dimensions(start_frame, max_width=7999, max_height=7999) + validate_image_dimensions(end_frame, max_width=7999, max_height=7999) + validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) + validate_image_aspect_ratio(end_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) + + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } - # Upload images stacked_input_images = image_tensor_pair_to_batch(start_frame, end_frame) download_urls = await upload_images_to_comfyapi( stacked_input_images, max_images=2, mime_type="image/png", - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) if len(download_urls) != 2: raise RunwayApiError("Failed to upload one or more images to comfy api.") - return await self.generate_video( - RunwayImageToVideoRequest( - promptText=prompt, - seed=seed, - model=Model("gen3a_turbo"), - duration=Duration(duration), - ratio=AspectRatio(ratio), - promptImage=RunwayPromptImageObject( - root=[ - RunwayPromptImageDetailedObject( - uri=str(download_urls[0]), position="first" - ), - RunwayPromptImageDetailedObject( - uri=str(download_urls[1]), position="last" - ), - ] + return comfy_io.NodeOutput( + await generate_video( + RunwayImageToVideoRequest( + promptText=prompt, + seed=seed, + model=Model("gen3a_turbo"), + duration=Duration(duration), + ratio=AspectRatio(ratio), + promptImage=RunwayPromptImageObject( + root=[ + RunwayPromptImageDetailedObject( + uri=str(download_urls[0]), position="first" + ), + RunwayPromptImageDetailedObject( + uri=str(download_urls[1]), position="last" + ), + ] + ), ), - ), - auth_kwargs=kwargs, - node_id=unique_id, + auth_kwargs=auth_kwargs, + node_id=cls.hidden.unique_id, + estimated_duration=AVERAGE_DURATION_FLF_SECONDS, + ) ) -class RunwayTextToImageNode(ComfyNodeABC): - """Runway Text to Image Node.""" - - RETURN_TYPES = ("IMAGE",) - FUNCTION = "api_call" - CATEGORY = "api node/image/Runway" - API_NODE = True - DESCRIPTION = "Generate an image from a text prompt using Runway's Gen 4 model. You can also include reference images to guide the generation." +class RunwayTextToImageNode(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, RunwayTextToImageRequest, "promptText", multiline=True + def define_schema(cls): + return comfy_io.Schema( + node_id="RunwayTextToImageNode", + display_name="Runway Text to Image", + category="api node/image/Runway", + description="Generate an image from a text prompt using Runway's Gen 4 model. " + "You can also include reference image to guide the generation.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for the generation", ), - "ratio": model_field_to_node_input( - IO.COMBO, - RunwayTextToImageRequest, + comfy_io.Combo.Input( "ratio", - enum_type=RunwayTextToImageAspectRatioEnum, + options=[model.value for model in RunwayTextToImageAspectRatioEnum], ), - }, - "optional": { - "reference_image": ( - IO.IMAGE, - {"tooltip": "Optional reference image to guide the generation"}, - ) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - def validate_task_created(self, response: RunwayTextToImageResponse) -> bool: - """ - Validate the task creation response from the Runway API matches - expected format. - """ - if not bool(response.id): - raise RunwayApiError("Invalid initial response from Runway API.") - return True - - def validate_response(self, response: TaskStatusResponse) -> bool: - """ - Validate the successful task status response from the Runway API - matches expected format. - """ - if not response.output or len(response.output) == 0: - raise RunwayApiError( - "Runway task succeeded but no image data found in response." - ) - return True - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> TaskStatusResponse: - """Poll the task status until it is finished then get the response.""" - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_GET_TASK_STATUS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TaskStatusResponse, - ), - estimated_duration=AVERAGE_DURATION_T2I_SECONDS, - node_id=node_id, + comfy_io.Image.Input( + "reference_image", + tooltip="Optional reference image to guide the generation", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, ratio: str, reference_image: Optional[torch.Tensor] = None, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[torch.Tensor]: - # Validate inputs + ) -> comfy_io.NodeOutput: validate_string(prompt, min_length=1) + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + # Prepare reference images if provided reference_images = None if reference_image is not None: - validate_input_image(reference_image) + validate_image_dimensions(reference_image, max_width=7999, max_height=7999) + validate_image_aspect_ratio(reference_image, min_aspect_ratio=0.5, max_aspect_ratio=2.0) download_urls = await upload_images_to_comfyapi( reference_image, max_images=1, mime_type="image/png", - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) - if len(download_urls) != 1: - raise RunwayApiError("Failed to upload reference image to comfy api.") - reference_images = [ReferenceImage(uri=str(download_urls[0]))] - # Create request request = RunwayTextToImageRequest( promptText=prompt, model=Model4.gen4_image, @@ -593,7 +565,6 @@ class RunwayTextToImageNode(ComfyNodeABC): referenceImages=reference_images, ) - # Execute initial request initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_TEXT_TO_IMAGE, @@ -602,34 +573,33 @@ class RunwayTextToImageNode(ComfyNodeABC): response_model=RunwayTextToImageResponse, ), request=request, - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) initial_response = await initial_operation.execute() - self.validate_task_created(initial_response) - task_id = initial_response.id # Poll for completion - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id + final_response = await get_response( + initial_response.id, + auth_kwargs=auth_kwargs, + node_id=cls.hidden.unique_id, + estimated_duration=AVERAGE_DURATION_T2I_SECONDS, ) - self.validate_response(final_response) + if not final_response.output: + raise RunwayApiError("Runway task succeeded but no image data found in response.") - # Download and return image - image_url = get_image_url_from_task_status(final_response) - return (await download_url_to_image_tensor(image_url),) + return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_task_status(final_response))) -NODE_CLASS_MAPPINGS = { - "RunwayFirstLastFrameNode": RunwayFirstLastFrameNode, - "RunwayImageToVideoNodeGen3a": RunwayImageToVideoNodeGen3a, - "RunwayImageToVideoNodeGen4": RunwayImageToVideoNodeGen4, - "RunwayTextToImageNode": RunwayTextToImageNode, -} +class RunwayExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + RunwayFirstLastFrameNode, + RunwayImageToVideoNodeGen3a, + RunwayImageToVideoNodeGen4, + RunwayTextToImageNode, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "RunwayFirstLastFrameNode": "Runway First-Last-Frame to Video", - "RunwayImageToVideoNodeGen3a": "Runway Image to Video (Gen3a Turbo)", - "RunwayImageToVideoNodeGen4": "Runway Image to Video (Gen4 Turbo)", - "RunwayTextToImageNode": "Runway Text to Image", -} +async def comfy_entrypoint() -> RunwayExtension: + return RunwayExtension() From 4368d8f87f580f526e8b2bc43bf0ac88e2b67033 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 3 Sep 2025 15:43:29 -0700 Subject: [PATCH 234/759] Update comment in api example. (#9708) --- script_examples/basic_api_example.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/script_examples/basic_api_example.py b/script_examples/basic_api_example.py index 9128420c4..7e20cc2c1 100644 --- a/script_examples/basic_api_example.py +++ b/script_examples/basic_api_example.py @@ -3,11 +3,7 @@ from urllib import request #This is the ComfyUI api prompt format. -#If you want it for a specific workflow you can "enable dev mode options" -#in the settings of the UI (gear beside the "Queue Size: ") this will enable -#a button on the UI to save workflows in api format. - -#keep in mind ComfyUI is pre alpha software so this format will change a bit. +#If you want it for a specific workflow you can "File -> Export (API)" in the interface. #this is the one for the default workflow prompt_text = """ From f48d05a2d17fe1a69e08fbabfb080e3779b36225 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 4 Sep 2025 04:21:38 +0300 Subject: [PATCH 235/759] convert AlignYourStepsScheduler node to V3 schema (#9226) --- comfy_extras/nodes_align_your_steps.py | 50 +++++++++++++++++--------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/comfy_extras/nodes_align_your_steps.py b/comfy_extras/nodes_align_your_steps.py index 8d856d0e8..edd5dadd4 100644 --- a/comfy_extras/nodes_align_your_steps.py +++ b/comfy_extras/nodes_align_your_steps.py @@ -1,6 +1,10 @@ #from: https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html import numpy as np import torch +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io + def loglinear_interp(t_steps, num_steps): """ @@ -19,25 +23,30 @@ NOISE_LEVELS = {"SD1": [14.6146412293, 6.4745760956, 3.8636745985, 2.694615152 "SDXL":[14.6146412293, 6.3184485287, 3.7681790315, 2.1811480769, 1.3405244945, 0.8620721141, 0.5550693289, 0.3798540708, 0.2332364134, 0.1114188177, 0.0291671582], "SVD": [700.00, 54.5, 15.886, 7.977, 4.248, 1.789, 0.981, 0.403, 0.173, 0.034, 0.002]} -class AlignYourStepsScheduler: +class AlignYourStepsScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model_type": (["SD1", "SDXL", "SVD"], ), - "steps": ("INT", {"default": 10, "min": 1, "max": 10000}), - "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" - - FUNCTION = "get_sigmas" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="AlignYourStepsScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Combo.Input("model_type", options=["SD1", "SDXL", "SVD"]), + io.Int.Input("steps", default=10, min=1, max=10000), + io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01), + ], + outputs=[io.Sigmas.Output()], + ) def get_sigmas(self, model_type, steps, denoise): + # Deprecated: use the V3 schema's `execute` method instead of this. + return AlignYourStepsScheduler().execute(model_type, steps, denoise).result + + @classmethod + def execute(cls, model_type, steps, denoise) -> io.NodeOutput: total_steps = steps if denoise < 1.0: if denoise <= 0.0: - return (torch.FloatTensor([]),) + return io.NodeOutput(torch.FloatTensor([])) total_steps = round(steps * denoise) sigmas = NOISE_LEVELS[model_type][:] @@ -46,8 +55,15 @@ class AlignYourStepsScheduler: sigmas = sigmas[-(total_steps + 1):] sigmas[-1] = 0 - return (torch.FloatTensor(sigmas), ) + return io.NodeOutput(torch.FloatTensor(sigmas)) -NODE_CLASS_MAPPINGS = { - "AlignYourStepsScheduler": AlignYourStepsScheduler, -} + +class AlignYourStepsExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + AlignYourStepsScheduler, + ] + +async def comfy_entrypoint() -> AlignYourStepsExtension: + return AlignYourStepsExtension() From 72855db715096bc378817b1aaffcf232fdc39659 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 3 Sep 2025 19:20:13 -0700 Subject: [PATCH 236/759] Fix potential rope issue. (#9710) --- comfy/ldm/audio/dit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ldm/audio/dit.py b/comfy/ldm/audio/dit.py index 179c5b67e..d0d69bbdc 100644 --- a/comfy/ldm/audio/dit.py +++ b/comfy/ldm/audio/dit.py @@ -632,7 +632,7 @@ class ContinuousTransformer(nn.Module): # Attention layers if self.rotary_pos_emb is not None: - rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1], dtype=x.dtype, device=x.device) + rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1], dtype=torch.float, device=x.device) else: rotary_pos_emb = None From b71f9bcb7143b8cd4fff627bb91b60739c915d4c Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Thu, 4 Sep 2025 14:14:02 +0800 Subject: [PATCH 237/759] Update template to 0.1.75 (#9711) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4ebe6cc2a..3008a5dc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.11 -comfyui-workflow-templates==0.1.73 +comfyui-workflow-templates==0.1.75 comfyui-embedded-docs==0.2.6 torch torchsde From b0338e930bbc1f9d01f005f224573d5994732932 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Thu, 4 Sep 2025 02:15:57 -0400 Subject: [PATCH 238/759] ComfyUI 0.3.57 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index e8e039373..4cc3c8647 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.56" +__version__ = "0.3.57" diff --git a/pyproject.toml b/pyproject.toml index cfd5d45ef..d75cd04a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.56" +version = "0.3.57" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From a9f1bb10a52ce08a3f21e6fc562554671c85c3d5 Mon Sep 17 00:00:00 2001 From: guill Date: Thu, 4 Sep 2025 16:13:28 -0700 Subject: [PATCH 239/759] Fix progress update crossover between users (#9706) * Fix showing progress from other sessions Because `client_id` was missing from ths `progress_state` message, it was being sent to all connected sessions. This technically meant that if someone had a graph with the same nodes, they would see the progress updates for others. Also added a test to prevent reoccurance and moved the tests around to make CI easier to hook up. * Fix CI issues related to timing-sensitive tests --- .github/workflows/test-execution.yml | 30 +++ comfy_execution/progress.py | 3 +- tests/conftest.py | 6 + .../extra_model_paths.yaml | 0 .../test_async_nodes.py | 28 ++- .../test_execution.py | 13 +- tests/execution/test_progress_isolation.py | 233 ++++++++++++++++++ .../testing_nodes/testing-pack/__init__.py | 0 .../testing-pack/api_test_nodes.py | 0 .../testing-pack/async_test_nodes.py | 0 .../testing_nodes/testing-pack/conditions.py | 0 .../testing-pack/flow_control.py | 0 .../testing-pack/specific_tests.py | 0 .../testing_nodes/testing-pack/stubs.py | 0 .../testing_nodes/testing-pack/tools.py | 0 .../testing_nodes/testing-pack/util.py | 0 16 files changed, 295 insertions(+), 18 deletions(-) create mode 100644 .github/workflows/test-execution.yml rename tests/{inference => execution}/extra_model_paths.yaml (100%) rename tests/{inference => execution}/test_async_nodes.py (95%) rename tests/{inference => execution}/test_execution.py (98%) create mode 100644 tests/execution/test_progress_isolation.py rename tests/{inference => execution}/testing_nodes/testing-pack/__init__.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/api_test_nodes.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/async_test_nodes.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/conditions.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/flow_control.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/specific_tests.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/stubs.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/tools.py (100%) rename tests/{inference => execution}/testing_nodes/testing-pack/util.py (100%) diff --git a/.github/workflows/test-execution.yml b/.github/workflows/test-execution.yml new file mode 100644 index 000000000..00ef07ebf --- /dev/null +++ b/.github/workflows/test-execution.yml @@ -0,0 +1,30 @@ +name: Execution Tests + +on: + push: + branches: [ main, master ] + pull_request: + branches: [ main, master ] + +jobs: + test: + strategy: + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + runs-on: ${{ matrix.os }} + continue-on-error: true + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + - name: Install requirements + run: | + python -m pip install --upgrade pip + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install -r requirements.txt + pip install -r tests-unit/requirements.txt + - name: Run Execution Tests + run: | + python -m pytest tests/execution -v --skip-timing-checks diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py index e8f5ede1e..f951a3350 100644 --- a/comfy_execution/progress.py +++ b/comfy_execution/progress.py @@ -181,8 +181,9 @@ class WebUIProgressHandler(ProgressHandler): } # Send a combined progress_state message with all node states + # Include client_id to ensure message is only sent to the initiating client self.server_instance.send_sync( - "progress_state", {"prompt_id": prompt_id, "nodes": active_nodes} + "progress_state", {"prompt_id": prompt_id, "nodes": active_nodes}, self.server_instance.client_id ) @override diff --git a/tests/conftest.py b/tests/conftest.py index 4e30eb581..290e3a5c0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,7 @@ def pytest_addoption(parser): parser.addoption('--output_dir', action="store", default='tests/inference/samples', help='Output directory for generated images') parser.addoption("--listen", type=str, default="127.0.0.1", metavar="IP", nargs="?", const="0.0.0.0", help="Specify the IP address to listen on (default: 127.0.0.1). If --listen is provided without an argument, it defaults to 0.0.0.0. (listens on all)") parser.addoption("--port", type=int, default=8188, help="Set the listen port.") + parser.addoption("--skip-timing-checks", action="store_true", default=False, help="Skip timing-related assertions in tests (useful for CI environments with variable performance)") # This initializes args at the beginning of the test session @pytest.fixture(scope="session", autouse=True) @@ -19,6 +20,11 @@ def args_pytest(pytestconfig): return args +@pytest.fixture(scope="session") +def skip_timing_checks(pytestconfig): + """Fixture that returns whether timing checks should be skipped.""" + return pytestconfig.getoption("--skip-timing-checks") + def pytest_collection_modifyitems(items): # Modifies items so tests run in the correct order diff --git a/tests/inference/extra_model_paths.yaml b/tests/execution/extra_model_paths.yaml similarity index 100% rename from tests/inference/extra_model_paths.yaml rename to tests/execution/extra_model_paths.yaml diff --git a/tests/inference/test_async_nodes.py b/tests/execution/test_async_nodes.py similarity index 95% rename from tests/inference/test_async_nodes.py rename to tests/execution/test_async_nodes.py index f029953dd..c771b4b36 100644 --- a/tests/inference/test_async_nodes.py +++ b/tests/execution/test_async_nodes.py @@ -7,7 +7,7 @@ import subprocess from pytest import fixture from comfy_execution.graph_utils import GraphBuilder -from tests.inference.test_execution import ComfyClient, run_warmup +from tests.execution.test_execution import ComfyClient, run_warmup @pytest.mark.execution @@ -23,7 +23,7 @@ class TestAsyncNodes: '--output-directory', args_pytest["output_dir"], '--listen', args_pytest["listen"], '--port', str(args_pytest["port"]), - '--extra-model-paths-config', 'tests/inference/extra_model_paths.yaml', + '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml', '--cpu', ] use_lru, lru_size = request.param @@ -81,7 +81,7 @@ class TestAsyncNodes: assert len(result_images) == 1, "Should have 1 image" assert np.array(result_images[0]).min() == 0 and np.array(result_images[0]).max() == 0, "Image should be black" - def test_multiple_async_parallel_execution(self, client: ComfyClient, builder: GraphBuilder): + def test_multiple_async_parallel_execution(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): """Test that multiple async nodes execute in parallel.""" # Warmup execution to ensure server is fully initialized run_warmup(client) @@ -104,7 +104,8 @@ class TestAsyncNodes: elapsed_time = time.time() - start_time # Should take ~0.5s (max duration) not 1.2s (sum of durations) - assert elapsed_time < 0.8, f"Parallel execution took {elapsed_time}s, expected < 0.8s" + if not skip_timing_checks: + assert elapsed_time < 0.8, f"Parallel execution took {elapsed_time}s, expected < 0.8s" # Verify all nodes executed assert result.did_run(sleep1) and result.did_run(sleep2) and result.did_run(sleep3) @@ -150,7 +151,7 @@ class TestAsyncNodes: with pytest.raises(urllib.error.HTTPError): client.run(g) - def test_async_lazy_evaluation(self, client: ComfyClient, builder: GraphBuilder): + def test_async_lazy_evaluation(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): """Test async nodes with lazy evaluation.""" # Warmup execution to ensure server is fully initialized run_warmup(client, prefix="warmup_lazy") @@ -173,7 +174,8 @@ class TestAsyncNodes: elapsed_time = time.time() - start_time # Should only execute sleep1, not sleep2 - assert elapsed_time < 0.5, f"Should skip sleep2, took {elapsed_time}s" + if not skip_timing_checks: + assert elapsed_time < 0.5, f"Should skip sleep2, took {elapsed_time}s" assert result.did_run(sleep1), "Sleep1 should have executed" assert not result.did_run(sleep2), "Sleep2 should have been skipped" @@ -310,7 +312,7 @@ class TestAsyncNodes: images = result.get_images(output) assert len(images) == 1, "Should have blocked second image" - def test_async_caching_behavior(self, client: ComfyClient, builder: GraphBuilder): + def test_async_caching_behavior(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): """Test that async nodes are properly cached.""" # Warmup execution to ensure server is fully initialized run_warmup(client, prefix="warmup_cache") @@ -330,9 +332,10 @@ class TestAsyncNodes: elapsed_time = time.time() - start_time assert not result2.did_run(sleep_node), "Should be cached" - assert elapsed_time < 0.1, f"Cached run took {elapsed_time}s, should be instant" + if not skip_timing_checks: + assert elapsed_time < 0.1, f"Cached run took {elapsed_time}s, should be instant" - def test_async_with_dynamic_prompts(self, client: ComfyClient, builder: GraphBuilder): + def test_async_with_dynamic_prompts(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): """Test async nodes within dynamically generated prompts.""" # Warmup execution to ensure server is fully initialized run_warmup(client, prefix="warmup_dynamic") @@ -345,8 +348,8 @@ class TestAsyncNodes: dynamic_async = g.node("TestDynamicAsyncGeneration", image1=image1.out(0), image2=image2.out(0), - num_async_nodes=3, - sleep_duration=0.2) + num_async_nodes=5, + sleep_duration=0.4) g.node("SaveImage", images=dynamic_async.out(0)) start_time = time.time() @@ -354,7 +357,8 @@ class TestAsyncNodes: elapsed_time = time.time() - start_time # Should execute async nodes in parallel within dynamic prompt - assert elapsed_time < 0.5, f"Dynamic async execution took {elapsed_time}s" + if not skip_timing_checks: + assert elapsed_time < 1.0, f"Dynamic async execution took {elapsed_time}s" assert result.did_run(dynamic_async) def test_async_resource_cleanup(self, client: ComfyClient, builder: GraphBuilder): diff --git a/tests/inference/test_execution.py b/tests/execution/test_execution.py similarity index 98% rename from tests/inference/test_execution.py rename to tests/execution/test_execution.py index e7b29302e..8ea05fdd8 100644 --- a/tests/inference/test_execution.py +++ b/tests/execution/test_execution.py @@ -149,7 +149,7 @@ class TestExecution: '--output-directory', args_pytest["output_dir"], '--listen', args_pytest["listen"], '--port', str(args_pytest["port"]), - '--extra-model-paths-config', 'tests/inference/extra_model_paths.yaml', + '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml', '--cpu', ] use_lru, lru_size = request.param @@ -518,7 +518,7 @@ class TestExecution: assert numpy.array(images[0]).min() == 63 and numpy.array(images[0]).max() == 63, "Image should have value 0.25" assert not result.did_run(test_node), "The execution should have been cached" - def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder): + def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): # Warmup execution to ensure server is fully initialized run_warmup(client) @@ -541,14 +541,15 @@ class TestExecution: # The test should take around 3.0 seconds (the longest sleep duration) # plus some overhead, but definitely less than the sum of all sleeps (9.0s) - assert elapsed_time < 8.9, f"Parallel execution took {elapsed_time}s, expected less than 8.9s" + if not skip_timing_checks: + assert elapsed_time < 8.9, f"Parallel execution took {elapsed_time}s, expected less than 8.9s" # Verify that all nodes executed assert result.did_run(sleep_node1), "Sleep node 1 should have run" assert result.did_run(sleep_node2), "Sleep node 2 should have run" assert result.did_run(sleep_node3), "Sleep node 3 should have run" - def test_parallel_sleep_expansion(self, client: ComfyClient, builder: GraphBuilder): + def test_parallel_sleep_expansion(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): # Warmup execution to ensure server is fully initialized run_warmup(client) @@ -574,7 +575,9 @@ class TestExecution: # Similar to the previous test, expect parallel execution of the sleep nodes # which should complete in less than the sum of all sleeps - assert elapsed_time < 10.0, f"Expansion execution took {elapsed_time}s, expected less than 5.5s" + # Lots of leeway here since Windows CI is slow + if not skip_timing_checks: + assert elapsed_time < 13.0, f"Expansion execution took {elapsed_time}s" # Verify the parallel sleep node executed assert result.did_run(parallel_sleep), "ParallelSleep node should have run" diff --git a/tests/execution/test_progress_isolation.py b/tests/execution/test_progress_isolation.py new file mode 100644 index 000000000..93dc0d41b --- /dev/null +++ b/tests/execution/test_progress_isolation.py @@ -0,0 +1,233 @@ +"""Test that progress updates are properly isolated between WebSocket clients.""" + +import json +import pytest +import time +import threading +import uuid +import websocket +from typing import List, Dict, Any +from comfy_execution.graph_utils import GraphBuilder +from tests.execution.test_execution import ComfyClient + + +class ProgressTracker: + """Tracks progress messages received by a WebSocket client.""" + + def __init__(self, client_id: str): + self.client_id = client_id + self.progress_messages: List[Dict[str, Any]] = [] + self.lock = threading.Lock() + + def add_message(self, message: Dict[str, Any]): + """Thread-safe addition of progress messages.""" + with self.lock: + self.progress_messages.append(message) + + def get_messages_for_prompt(self, prompt_id: str) -> List[Dict[str, Any]]: + """Get all progress messages for a specific prompt_id.""" + with self.lock: + return [ + msg for msg in self.progress_messages + if msg.get('data', {}).get('prompt_id') == prompt_id + ] + + def has_cross_contamination(self, own_prompt_id: str) -> bool: + """Check if this client received progress for other prompts.""" + with self.lock: + for msg in self.progress_messages: + msg_prompt_id = msg.get('data', {}).get('prompt_id') + if msg_prompt_id and msg_prompt_id != own_prompt_id: + return True + return False + + +class IsolatedClient(ComfyClient): + """Extended ComfyClient that tracks all WebSocket messages.""" + + def __init__(self): + super().__init__() + self.progress_tracker = None + self.all_messages: List[Dict[str, Any]] = [] + + def connect(self, listen='127.0.0.1', port=8188, client_id=None): + """Connect with a specific client_id and set up message tracking.""" + if client_id is None: + client_id = str(uuid.uuid4()) + super().connect(listen, port, client_id) + self.progress_tracker = ProgressTracker(client_id) + + def listen_for_messages(self, duration: float = 5.0): + """Listen for WebSocket messages for a specified duration.""" + end_time = time.time() + duration + self.ws.settimeout(0.5) # Non-blocking with timeout + + while time.time() < end_time: + try: + out = self.ws.recv() + if isinstance(out, str): + message = json.loads(out) + self.all_messages.append(message) + + # Track progress_state messages + if message.get('type') == 'progress_state': + self.progress_tracker.add_message(message) + except websocket.WebSocketTimeoutException: + continue + except Exception: + # Log error silently in test context + break + + +@pytest.mark.execution +class TestProgressIsolation: + """Test suite for verifying progress update isolation between clients.""" + + @pytest.fixture(scope="class", autouse=True) + def _server(self, args_pytest): + """Start the ComfyUI server for testing.""" + import subprocess + pargs = [ + 'python', 'main.py', + '--output-directory', args_pytest["output_dir"], + '--listen', args_pytest["listen"], + '--port', str(args_pytest["port"]), + '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml', + '--cpu', + ] + p = subprocess.Popen(pargs) + yield + p.kill() + + def start_client_with_retry(self, listen: str, port: int, client_id: str = None): + """Start client with connection retries.""" + client = IsolatedClient() + # Connect to server (with retries) + n_tries = 5 + for i in range(n_tries): + time.sleep(4) + try: + client.connect(listen, port, client_id) + return client + except ConnectionRefusedError as e: + print(e) # noqa: T201 + print(f"({i+1}/{n_tries}) Retrying...") # noqa: T201 + raise ConnectionRefusedError(f"Failed to connect after {n_tries} attempts") + + def test_progress_isolation_between_clients(self, args_pytest): + """Test that progress updates are isolated between different clients.""" + listen = args_pytest["listen"] + port = args_pytest["port"] + + # Create two separate clients with unique IDs + client_a_id = "client_a_" + str(uuid.uuid4()) + client_b_id = "client_b_" + str(uuid.uuid4()) + + try: + # Connect both clients with retries + client_a = self.start_client_with_retry(listen, port, client_a_id) + client_b = self.start_client_with_retry(listen, port, client_b_id) + + # Create simple workflows for both clients + graph_a = GraphBuilder(prefix="client_a") + image_a = graph_a.node("StubImage", content="BLACK", height=256, width=256, batch_size=1) + graph_a.node("PreviewImage", images=image_a.out(0)) + + graph_b = GraphBuilder(prefix="client_b") + image_b = graph_b.node("StubImage", content="WHITE", height=256, width=256, batch_size=1) + graph_b.node("PreviewImage", images=image_b.out(0)) + + # Submit workflows from both clients + prompt_a = graph_a.finalize() + prompt_b = graph_b.finalize() + + response_a = client_a.queue_prompt(prompt_a) + prompt_id_a = response_a['prompt_id'] + + response_b = client_b.queue_prompt(prompt_b) + prompt_id_b = response_b['prompt_id'] + + # Start threads to listen for messages on both clients + def listen_client_a(): + client_a.listen_for_messages(duration=10.0) + + def listen_client_b(): + client_b.listen_for_messages(duration=10.0) + + thread_a = threading.Thread(target=listen_client_a) + thread_b = threading.Thread(target=listen_client_b) + + thread_a.start() + thread_b.start() + + # Wait for threads to complete + thread_a.join() + thread_b.join() + + # Verify isolation + # Client A should only receive progress for prompt_id_a + assert not client_a.progress_tracker.has_cross_contamination(prompt_id_a), \ + f"Client A received progress updates for other clients' workflows. " \ + f"Expected only {prompt_id_a}, but got messages for multiple prompts." + + # Client B should only receive progress for prompt_id_b + assert not client_b.progress_tracker.has_cross_contamination(prompt_id_b), \ + f"Client B received progress updates for other clients' workflows. " \ + f"Expected only {prompt_id_b}, but got messages for multiple prompts." + + # Verify each client received their own progress updates + client_a_messages = client_a.progress_tracker.get_messages_for_prompt(prompt_id_a) + client_b_messages = client_b.progress_tracker.get_messages_for_prompt(prompt_id_b) + + assert len(client_a_messages) > 0, \ + "Client A did not receive any progress updates for its own workflow" + assert len(client_b_messages) > 0, \ + "Client B did not receive any progress updates for its own workflow" + + # Ensure no cross-contamination + client_a_other = client_a.progress_tracker.get_messages_for_prompt(prompt_id_b) + client_b_other = client_b.progress_tracker.get_messages_for_prompt(prompt_id_a) + + assert len(client_a_other) == 0, \ + f"Client A incorrectly received {len(client_a_other)} progress updates for Client B's workflow" + assert len(client_b_other) == 0, \ + f"Client B incorrectly received {len(client_b_other)} progress updates for Client A's workflow" + + finally: + # Clean up connections + if hasattr(client_a, 'ws'): + client_a.ws.close() + if hasattr(client_b, 'ws'): + client_b.ws.close() + + def test_progress_with_missing_client_id(self, args_pytest): + """Test that progress updates handle missing client_id gracefully.""" + listen = args_pytest["listen"] + port = args_pytest["port"] + + try: + # Connect client with retries + client = self.start_client_with_retry(listen, port) + + # Create a simple workflow + graph = GraphBuilder(prefix="test_missing_id") + image = graph.node("StubImage", content="BLACK", height=128, width=128, batch_size=1) + graph.node("PreviewImage", images=image.out(0)) + + # Submit workflow + prompt = graph.finalize() + response = client.queue_prompt(prompt) + prompt_id = response['prompt_id'] + + # Listen for messages + client.listen_for_messages(duration=5.0) + + # Should still receive progress updates for own workflow + messages = client.progress_tracker.get_messages_for_prompt(prompt_id) + assert len(messages) > 0, \ + "Client did not receive progress updates even though it initiated the workflow" + + finally: + if hasattr(client, 'ws'): + client.ws.close() + diff --git a/tests/inference/testing_nodes/testing-pack/__init__.py b/tests/execution/testing_nodes/testing-pack/__init__.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/__init__.py rename to tests/execution/testing_nodes/testing-pack/__init__.py diff --git a/tests/inference/testing_nodes/testing-pack/api_test_nodes.py b/tests/execution/testing_nodes/testing-pack/api_test_nodes.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/api_test_nodes.py rename to tests/execution/testing_nodes/testing-pack/api_test_nodes.py diff --git a/tests/inference/testing_nodes/testing-pack/async_test_nodes.py b/tests/execution/testing_nodes/testing-pack/async_test_nodes.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/async_test_nodes.py rename to tests/execution/testing_nodes/testing-pack/async_test_nodes.py diff --git a/tests/inference/testing_nodes/testing-pack/conditions.py b/tests/execution/testing_nodes/testing-pack/conditions.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/conditions.py rename to tests/execution/testing_nodes/testing-pack/conditions.py diff --git a/tests/inference/testing_nodes/testing-pack/flow_control.py b/tests/execution/testing_nodes/testing-pack/flow_control.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/flow_control.py rename to tests/execution/testing_nodes/testing-pack/flow_control.py diff --git a/tests/inference/testing_nodes/testing-pack/specific_tests.py b/tests/execution/testing_nodes/testing-pack/specific_tests.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/specific_tests.py rename to tests/execution/testing_nodes/testing-pack/specific_tests.py diff --git a/tests/inference/testing_nodes/testing-pack/stubs.py b/tests/execution/testing_nodes/testing-pack/stubs.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/stubs.py rename to tests/execution/testing_nodes/testing-pack/stubs.py diff --git a/tests/inference/testing_nodes/testing-pack/tools.py b/tests/execution/testing_nodes/testing-pack/tools.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/tools.py rename to tests/execution/testing_nodes/testing-pack/tools.py diff --git a/tests/inference/testing_nodes/testing-pack/util.py b/tests/execution/testing_nodes/testing-pack/util.py similarity index 100% rename from tests/inference/testing_nodes/testing-pack/util.py rename to tests/execution/testing_nodes/testing-pack/util.py From 261421e21899abc8168c71efd8694ade020bcee2 Mon Sep 17 00:00:00 2001 From: "Yousef R. Gamaleldin" <81116377+yousef-rafat@users.noreply.github.com> Date: Fri, 5 Sep 2025 03:36:20 +0300 Subject: [PATCH 240/759] Add Hunyuan 3D 2.1 Support (#8714) --- comfy/clip_vision.py | 231 ++++++++- comfy/image_encoders/dino2.py | 33 +- comfy/image_encoders/dino2_large.json | 22 + comfy/latent_formats.py | 5 + comfy/ldm/hunyuan3d/vae.py | 569 ++++++++++++++++++---- comfy/ldm/hunyuan3dv2_1/hunyuandit.py | 658 ++++++++++++++++++++++++++ comfy/model_base.py | 17 + comfy/model_detection.py | 14 + comfy/sd.py | 49 +- comfy/supported_models.py | 13 +- comfy_extras/nodes_hunyuan3d.py | 24 +- nodes.py | 29 +- requirements.txt | 2 +- 13 files changed, 1537 insertions(+), 129 deletions(-) create mode 100644 comfy/image_encoders/dino2_large.json create mode 100644 comfy/ldm/hunyuan3dv2_1/hunyuandit.py diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py index 2fa410cb7..4bc640e8b 100644 --- a/comfy/clip_vision.py +++ b/comfy/clip_vision.py @@ -17,10 +17,227 @@ class Output: def __setitem__(self, key, item): setattr(self, key, item) -def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True): + +def cubic_kernel(x, a: float = -0.75): + absx = x.abs() + absx2 = absx ** 2 + absx3 = absx ** 3 + + w = (a + 2) * absx3 - (a + 3) * absx2 + 1 + w2 = a * absx3 - 5*a * absx2 + 8*a * absx - 4*a + + return torch.where(absx <= 1, w, torch.where(absx < 2, w2, torch.zeros_like(x))) + +def get_indices_weights(in_size, out_size, scale): + # OpenCV-style half-pixel mapping + x = torch.arange(out_size, dtype=torch.float32) + x = (x + 0.5) / scale - 0.5 + + x0 = x.floor().long() + dx = x.unsqueeze(1) - (x0.unsqueeze(1) + torch.arange(-1, 3)) + + weights = cubic_kernel(dx) + weights = weights / weights.sum(dim=1, keepdim=True) + + indices = x0.unsqueeze(1) + torch.arange(-1, 3) + indices = indices.clamp(0, in_size - 1) + + return indices, weights + +def resize_cubic_1d(x, out_size, dim): + b, c, h, w = x.shape + in_size = h if dim == 2 else w + scale = out_size / in_size + + indices, weights = get_indices_weights(in_size, out_size, scale) + + if dim == 2: + x = x.permute(0, 1, 3, 2) + x = x.reshape(-1, h) + else: + x = x.reshape(-1, w) + + gathered = x[:, indices] + out = (gathered * weights.unsqueeze(0)).sum(dim=2) + + if dim == 2: + out = out.reshape(b, c, w, out_size).permute(0, 1, 3, 2) + else: + out = out.reshape(b, c, h, out_size) + + return out + +def resize_cubic(img: torch.Tensor, size: tuple) -> torch.Tensor: + """ + Resize image using OpenCV-equivalent INTER_CUBIC interpolation. + Implemented in pure PyTorch + """ + + if img.ndim == 3: + img = img.unsqueeze(0) + + img = img.permute(0, 3, 1, 2) + + out_h, out_w = size + img = resize_cubic_1d(img, out_h, dim=2) + img = resize_cubic_1d(img, out_w, dim=3) + return img + +def resize_area(img: torch.Tensor, size: tuple) -> torch.Tensor: + # vectorized implementation for OpenCV's INTER_AREA using pure PyTorch + original_shape = img.shape + is_hwc = False + + if img.ndim == 3: + if img.shape[0] <= 4: + img = img.unsqueeze(0) + else: + is_hwc = True + img = img.permute(2, 0, 1).unsqueeze(0) + elif img.ndim == 4: + pass + else: + raise ValueError("Expected image with 3 or 4 dims.") + + B, C, H, W = img.shape + out_h, out_w = size + scale_y = H / out_h + scale_x = W / out_w + + device = img.device + + # compute the grid boundries + y_start = torch.arange(out_h, device=device).float() * scale_y + y_end = y_start + scale_y + x_start = torch.arange(out_w, device=device).float() * scale_x + x_end = x_start + scale_x + + # for each output pixel, we will compute the range for it + y_start_int = torch.floor(y_start).long() + y_end_int = torch.ceil(y_end).long() + x_start_int = torch.floor(x_start).long() + x_end_int = torch.ceil(x_end).long() + + # We will build the weighted sums by iterating over contributing input pixels once + output = torch.zeros((B, C, out_h, out_w), dtype=torch.float32, device=device) + area = torch.zeros((out_h, out_w), dtype=torch.float32, device=device) + + max_kernel_h = int(torch.max(y_end_int - y_start_int).item()) + max_kernel_w = int(torch.max(x_end_int - x_start_int).item()) + + for dy in range(max_kernel_h): + for dx in range(max_kernel_w): + # compute the weights for this offset for all output pixels + + y_idx = y_start_int.unsqueeze(1) + dy + x_idx = x_start_int.unsqueeze(0) + dx + + # clamp indices to image boundaries + y_idx_clamped = torch.clamp(y_idx, 0, H - 1) + x_idx_clamped = torch.clamp(x_idx, 0, W - 1) + + # compute weights by broadcasting + y_weight = (torch.min(y_end.unsqueeze(1), y_idx_clamped.float() + 1.0) - torch.max(y_start.unsqueeze(1), y_idx_clamped.float())).clamp(min=0) + x_weight = (torch.min(x_end.unsqueeze(0), x_idx_clamped.float() + 1.0) - torch.max(x_start.unsqueeze(0), x_idx_clamped.float())).clamp(min=0) + + weight = (y_weight * x_weight) + + y_expand = y_idx_clamped.expand(out_h, out_w) + x_expand = x_idx_clamped.expand(out_h, out_w) + + + pixels = img[:, :, y_expand, x_expand] + + # unsqueeze to broadcast + w = weight.unsqueeze(0).unsqueeze(0) + + output += pixels * w + area += weight + + # Normalize by area + output /= area.unsqueeze(0).unsqueeze(0) + + if is_hwc: + return output[0].permute(1, 2, 0) + elif img.shape[0] == 1 and original_shape[0] <= 4: + return output[0] + else: + return output + +def recenter(image, border_ratio: float = 0.2): + + if image.shape[-1] == 4: + mask = image[..., 3] + else: + mask = torch.ones_like(image[..., 0:1]) * 255 + image = torch.concatenate([image, mask], axis=-1) + mask = mask[..., 0] + + H, W, C = image.shape + + size = max(H, W) + result = torch.zeros((size, size, C), dtype = torch.uint8) + + # as_tuple to match numpy behaviour + x_coords, y_coords = torch.nonzero(mask, as_tuple=True) + + y_min, y_max = y_coords.min(), y_coords.max() + x_min, x_max = x_coords.min(), x_coords.max() + + h = x_max - x_min + w = y_max - y_min + + if h == 0 or w == 0: + raise ValueError('input image is empty') + + desired_size = int(size * (1 - border_ratio)) + scale = desired_size / max(h, w) + + h2 = int(h * scale) + w2 = int(w * scale) + + x2_min = (size - h2) // 2 + x2_max = x2_min + h2 + + y2_min = (size - w2) // 2 + y2_max = y2_min + w2 + + # note: opencv takes columns first (opposite to pytorch and numpy that take the row first) + result[x2_min:x2_max, y2_min:y2_max] = resize_area(image[x_min:x_max, y_min:y_max], (h2, w2)) + + bg = torch.ones((result.shape[0], result.shape[1], 3), dtype = torch.uint8) * 255 + + mask = result[..., 3:].to(torch.float32) / 255 + result = result[..., :3] * mask + bg * (1 - mask) + + mask = mask * 255 + result = result.clip(0, 255).to(torch.uint8) + mask = mask.clip(0, 255).to(torch.uint8) + + return result + +def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], + crop=True, value_range = (-1, 1), border_ratio: float = None, recenter_size: int = 512): + + if border_ratio is not None: + + image = (image * 255).clamp(0, 255).to(torch.uint8) + image = [recenter(img, border_ratio = border_ratio) for img in image] + + image = torch.stack(image, dim = 0) + image = resize_cubic(image, size = (recenter_size, recenter_size)) + + image = image / 255 * 2 - 1 + low, high = value_range + + image = (image - low) / (high - low) + image = image.permute(0, 2, 3, 1) + image = image[:, :, :, :3] if image.shape[3] > 3 else image + mean = torch.tensor(mean, device=image.device, dtype=image.dtype) std = torch.tensor(std, device=image.device, dtype=image.dtype) + image = image.movedim(-1, 1) if not (image.shape[2] == size and image.shape[3] == size): if crop: @@ -29,7 +246,7 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s else: scale_size = (size, size) - image = torch.nn.functional.interpolate(image, size=scale_size, mode="bicubic", antialias=True) + image = torch.nn.functional.interpolate(image, size=scale_size, mode="bilinear" if border_ratio is not None else "bicubic", antialias=True) h = (image.shape[2] - size)//2 w = (image.shape[3] - size)//2 image = image[:,:,h:h+size,w:w+size] @@ -71,9 +288,9 @@ class ClipVisionModel(): def get_sd(self): return self.model.state_dict() - def encode_image(self, image, crop=True): + def encode_image(self, image, crop=True, border_ratio: float = None): comfy.model_management.load_model_gpu(self.patcher) - pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float() + pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop, border_ratio=border_ratio).float() out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2) outputs = Output() @@ -136,8 +353,12 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False): json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json") else: json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json") - elif "embeddings.patch_embeddings.projection.weight" in sd: + + # Dinov2 + elif 'encoder.layer.39.layer_scale2.lambda1' in sd: json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_giant.json") + elif 'encoder.layer.23.layer_scale2.lambda1' in sd: + json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_large.json") else: return None diff --git a/comfy/image_encoders/dino2.py b/comfy/image_encoders/dino2.py index 976f98c65..9b6dace9d 100644 --- a/comfy/image_encoders/dino2.py +++ b/comfy/image_encoders/dino2.py @@ -31,6 +31,20 @@ class LayerScale(torch.nn.Module): def forward(self, x): return x * comfy.model_management.cast_to_device(self.lambda1, x.device, x.dtype) +class Dinov2MLP(torch.nn.Module): + def __init__(self, hidden_size: int, dtype, device, operations): + super().__init__() + + mlp_ratio = 4 + hidden_features = int(hidden_size * mlp_ratio) + self.fc1 = operations.Linear(hidden_size, hidden_features, bias = True, device=device, dtype=dtype) + self.fc2 = operations.Linear(hidden_features, hidden_size, bias = True, device=device, dtype=dtype) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = torch.nn.functional.gelu(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state class SwiGLUFFN(torch.nn.Module): def __init__(self, dim, dtype, device, operations): @@ -50,12 +64,15 @@ class SwiGLUFFN(torch.nn.Module): class Dino2Block(torch.nn.Module): - def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations): + def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn): super().__init__() self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations) self.layer_scale1 = LayerScale(dim, dtype, device, operations) self.layer_scale2 = LayerScale(dim, dtype, device, operations) - self.mlp = SwiGLUFFN(dim, dtype, device, operations) + if use_swiglu_ffn: + self.mlp = SwiGLUFFN(dim, dtype, device, operations) + else: + self.mlp = Dinov2MLP(dim, dtype, device, operations) self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device) self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device) @@ -66,9 +83,10 @@ class Dino2Block(torch.nn.Module): class Dino2Encoder(torch.nn.Module): - def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations): + def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn): super().__init__() - self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations) for _ in range(num_layers)]) + self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn) + for _ in range(num_layers)]) def forward(self, x, intermediate_output=None): optimized_attention = optimized_attention_for_device(x.device, False, small_input=True) @@ -78,8 +96,8 @@ class Dino2Encoder(torch.nn.Module): intermediate_output = len(self.layer) + intermediate_output intermediate = None - for i, l in enumerate(self.layer): - x = l(x, optimized_attention) + for i, layer in enumerate(self.layer): + x = layer(x, optimized_attention) if i == intermediate_output: intermediate = x.clone() return x, intermediate @@ -128,9 +146,10 @@ class Dinov2Model(torch.nn.Module): dim = config_dict["hidden_size"] heads = config_dict["num_attention_heads"] layer_norm_eps = config_dict["layer_norm_eps"] + use_swiglu_ffn = config_dict["use_swiglu_ffn"] self.embeddings = Dino2Embeddings(dim, dtype, device, operations) - self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations) + self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations, use_swiglu_ffn = use_swiglu_ffn) self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device) def forward(self, pixel_values, attention_mask=None, intermediate_output=None): diff --git a/comfy/image_encoders/dino2_large.json b/comfy/image_encoders/dino2_large.json new file mode 100644 index 000000000..43fbb58ff --- /dev/null +++ b/comfy/image_encoders/dino2_large.json @@ -0,0 +1,22 @@ +{ + "hidden_size": 1024, + "use_mask_token": true, + "patch_size": 14, + "image_size": 518, + "num_channels": 3, + "num_attention_heads": 16, + "initializer_range": 0.02, + "attention_probs_dropout_prob": 0.0, + "hidden_dropout_prob": 0.0, + "hidden_act": "gelu", + "mlp_ratio": 4, + "model_type": "dinov2", + "num_hidden_layers": 24, + "layer_norm_eps": 1e-6, + "qkv_bias": true, + "use_swiglu_ffn": false, + "layerscale_value": 1.0, + "drop_path_rate": 0.0, + "image_mean": [0.485, 0.456, 0.406], + "image_std": [0.229, 0.224, 0.225] +} diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index caf4991fc..0d84994b0 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -538,6 +538,11 @@ class Hunyuan3Dv2(LatentFormat): latent_dimensions = 1 scale_factor = 0.9990943042622529 +class Hunyuan3Dv2_1(LatentFormat): + scale_factor = 1.0039506158752403 + latent_channels = 64 + latent_dimensions = 1 + class Hunyuan3Dv2mini(LatentFormat): latent_channels = 64 latent_dimensions = 1 diff --git a/comfy/ldm/hunyuan3d/vae.py b/comfy/ldm/hunyuan3d/vae.py index 6e8cbf1d9..760944827 100644 --- a/comfy/ldm/hunyuan3d/vae.py +++ b/comfy/ldm/hunyuan3d/vae.py @@ -4,81 +4,458 @@ import torch import torch.nn as nn import torch.nn.functional as F - - -from typing import Union, Tuple, List, Callable, Optional - import numpy as np -from einops import repeat, rearrange +import math from tqdm import tqdm + +from typing import Optional + import logging import comfy.ops ops = comfy.ops.disable_weight_init -def generate_dense_grid_points( - bbox_min: np.ndarray, - bbox_max: np.ndarray, - octree_resolution: int, - indexing: str = "ij", -): - length = bbox_max - bbox_min - num_cells = octree_resolution +def fps(src: torch.Tensor, batch: torch.Tensor, sampling_ratio: float, start_random: bool = True): - x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32) - y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32) - z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32) - [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing) - xyz = np.stack((xs, ys, zs), axis=-1) - grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1] + # manually create the pointer vector + assert src.size(0) == batch.numel() - return xyz, grid_size, length + batch_size = int(batch.max()) + 1 + deg = src.new_zeros(batch_size, dtype = torch.long) + + deg.scatter_add_(0, batch, torch.ones_like(batch)) + + ptr_vec = deg.new_zeros(batch_size + 1) + torch.cumsum(deg, 0, out=ptr_vec[1:]) + + #return fps_sampling(src, ptr_vec, ratio) + sampled_indicies = [] + + for b in range(batch_size): + # start and the end of each batch + start, end = ptr_vec[b].item(), ptr_vec[b + 1].item() + # points from the point cloud + points = src[start:end] + + num_points = points.size(0) + num_samples = max(1, math.ceil(num_points * sampling_ratio)) + + selected = torch.zeros(num_samples, device = src.device, dtype = torch.long) + distances = torch.full((num_points,), float("inf"), device = src.device) + + # select a random start point + if start_random: + farthest = torch.randint(0, num_points, (1,), device = src.device) + else: + farthest = torch.tensor([0], device = src.device, dtype = torch.long) + + for i in range(num_samples): + selected[i] = farthest + centroid = points[farthest].squeeze(0) + dist = torch.norm(points - centroid, dim = 1) # compute euclidean distance + distances = torch.minimum(distances, dist) + farthest = torch.argmax(distances) + + sampled_indicies.append(torch.arange(start, end)[selected]) + + return torch.cat(sampled_indicies, dim = 0) +class PointCrossAttention(nn.Module): + def __init__(self, + num_latents: int, + downsample_ratio: float, + pc_size: int, + pc_sharpedge_size: int, + point_feats: int, + width: int, + heads: int, + layers: int, + fourier_embedder, + normal_pe: bool = False, + qkv_bias: bool = False, + use_ln_post: bool = True, + qk_norm: bool = True): + + super().__init__() + + self.fourier_embedder = fourier_embedder + + self.pc_size = pc_size + self.normal_pe = normal_pe + self.downsample_ratio = downsample_ratio + self.pc_sharpedge_size = pc_sharpedge_size + self.num_latents = num_latents + self.point_feats = point_feats + + self.input_proj = nn.Linear(self.fourier_embedder.out_dim + point_feats, width) + + self.cross_attn = ResidualCrossAttentionBlock( + width = width, + heads = heads, + qkv_bias = qkv_bias, + qk_norm = qk_norm + ) + + self.self_attn = None + if layers > 0: + self.self_attn = Transformer( + width = width, + heads = heads, + qkv_bias = qkv_bias, + qk_norm = qk_norm, + layers = layers + ) + + if use_ln_post: + self.ln_post = nn.LayerNorm(width) + else: + self.ln_post = None + + def sample_points_and_latents(self, point_cloud: torch.Tensor, features: torch.Tensor): + + """ + Subsample points randomly from the point cloud (input_pc) + Further sample the subsampled points to get query_pc + take the fourier embeddings for both input and query pc + + Mental Note: FPS-sampled points (query_pc) act as latent tokens that attend to and learn from the broader context in input_pc. + Goal: get a smaller represenation (query_pc) to represent the entire scence structure by learning from a broader subset (input_pc). + More computationally efficient. + + Features are additional information for each point in the cloud + """ + + B, _, D = point_cloud.shape + + num_latents = int(self.num_latents) + + num_random_query = self.pc_size / (self.pc_size + self.pc_sharpedge_size) * num_latents + num_sharpedge_query = num_latents - num_random_query + + # Split random and sharpedge surface points + random_pc, sharpedge_pc = torch.split(point_cloud, [self.pc_size, self.pc_sharpedge_size], dim=1) + + # assert statements + assert random_pc.shape[1] <= self.pc_size, "Random surface points size must be less than or equal to pc_size" + assert sharpedge_pc.shape[1] <= self.pc_sharpedge_size, "Sharpedge surface points size must be less than or equal to pc_sharpedge_size" + + input_random_pc_size = int(num_random_query * self.downsample_ratio) + random_query_pc, random_input_pc, random_idx_pc, random_idx_query = \ + self.subsample(pc = random_pc, num_query = num_random_query, input_pc_size = input_random_pc_size) + + input_sharpedge_pc_size = int(num_sharpedge_query * self.downsample_ratio) + + if input_sharpedge_pc_size == 0: + sharpedge_input_pc = torch.zeros(B, 0, D, dtype = random_input_pc.dtype).to(point_cloud.device) + sharpedge_query_pc = torch.zeros(B, 0, D, dtype= random_query_pc.dtype).to(point_cloud.device) + + else: + sharpedge_query_pc, sharpedge_input_pc, sharpedge_idx_pc, sharpedge_idx_query = \ + self.subsample(pc = sharpedge_pc, num_query = num_sharpedge_query, input_pc_size = input_sharpedge_pc_size) + + # concat the random and sharpedges + query_pc = torch.cat([random_query_pc, sharpedge_query_pc], dim = 1) + input_pc = torch.cat([random_input_pc, sharpedge_input_pc], dim = 1) + + query = self.fourier_embedder(query_pc) + data = self.fourier_embedder(input_pc) + + if self.point_feats > 0: + random_surface_features, sharpedge_surface_features = torch.split(features, [self.pc_size, self.pc_sharpedge_size], dim = 1) + + input_random_surface_features, query_random_features = \ + self.handle_features(features = random_surface_features, idx_pc = random_idx_pc, batch_size = B, + input_pc_size = input_random_pc_size, idx_query = random_idx_query) + + if input_sharpedge_pc_size == 0: + input_sharpedge_surface_features = torch.zeros(B, 0, self.point_feats, + dtype = input_random_surface_features.dtype, device = point_cloud.device) + + query_sharpedge_features = torch.zeros(B, 0, self.point_feats, + dtype = query_random_features.dtype, device = point_cloud.device) + else: + + input_sharpedge_surface_features, query_sharpedge_features = \ + self.handle_features(idx_pc = sharpedge_idx_pc, features = sharpedge_surface_features, + batch_size = B, idx_query = sharpedge_idx_query, input_pc_size = input_sharpedge_pc_size) + + query_features = torch.cat([query_random_features, query_sharpedge_features], dim = 1) + input_features = torch.cat([input_random_surface_features, input_sharpedge_surface_features], dim = 1) + + if self.normal_pe: + # apply the fourier embeddings on the first 3 dims (xyz) + input_features_pe = self.fourier_embedder(input_features[..., :3]) + query_features_pe = self.fourier_embedder(query_features[..., :3]) + # replace the first 3 dims with the new PE ones + input_features = torch.cat([input_features_pe, input_features[..., :3]], dim = -1) + query_features = torch.cat([query_features_pe, query_features[..., :3]], dim = -1) + + # concat at the channels dim + query = torch.cat([query, query_features], dim = -1) + data = torch.cat([data, input_features], dim = -1) + + # don't return pc_info to avoid unnecessary memory usuage + return query.view(B, -1, query.shape[-1]), data.view(B, -1, data.shape[-1]) + + def forward(self, point_cloud: torch.Tensor, features: torch.Tensor): + + query, data = self.sample_points_and_latents(point_cloud = point_cloud, features = features) + + # apply projections + query = self.input_proj(query) + data = self.input_proj(data) + + # apply cross attention between query and data + latents = self.cross_attn(query, data) + + if self.self_attn is not None: + latents = self.self_attn(latents) + + if self.ln_post is not None: + latents = self.ln_post(latents) + + return latents -class VanillaVolumeDecoder: + def subsample(self, pc, num_query, input_pc_size: int): + + """ + num_query: number of points to keep after FPS + input_pc_size: number of points to select before FPS + """ + + B, _, D = pc.shape + query_ratio = num_query / input_pc_size + + # random subsampling of points inside the point cloud + idx_pc = torch.randperm(pc.shape[1], device = pc.device)[:input_pc_size] + input_pc = pc[:, idx_pc, :] + + # flatten to allow applying fps across the whole batch + flattent_input_pc = input_pc.view(B * input_pc_size, D) + + # construct a batch_down tensor to tell fps + # which points belong to which batch + N_down = int(flattent_input_pc.shape[0] / B) + batch_down = torch.arange(B).to(pc.device) + batch_down = torch.repeat_interleave(batch_down, N_down) + + idx_query = fps(flattent_input_pc, batch_down, sampling_ratio = query_ratio) + query_pc = flattent_input_pc[idx_query].view(B, -1, D) + + return query_pc, input_pc, idx_pc, idx_query + + def handle_features(self, features, idx_pc, input_pc_size, batch_size: int, idx_query): + + B = batch_size + + input_surface_features = features[:, idx_pc, :] + flattent_input_features = input_surface_features.view(B * input_pc_size, -1) + query_features = flattent_input_features[idx_query].view(B, -1, + flattent_input_features.shape[-1]) + + return input_surface_features, query_features + +def normalize_mesh(mesh, scale = 0.9999): + """Normalize mesh to fit in [-scale, scale]. Translate mesh so its center is [0,0,0]""" + + bbox = mesh.bounds + center = (bbox[1] + bbox[0]) / 2 + + max_extent = (bbox[1] - bbox[0]).max() + mesh.apply_translation(-center) + mesh.apply_scale((2 * scale) / max_extent) + + return mesh + +def sample_pointcloud(mesh, num = 200000): + """ Uniformly sample points from the surface of the mesh """ + + points, face_idx = mesh.sample(num, return_index = True) + normals = mesh.face_normals[face_idx] + return torch.from_numpy(points.astype(np.float32)), torch.from_numpy(normals.astype(np.float32)) + +def detect_sharp_edges(mesh, threshold=0.985): + """Return edge indices (a, b) that lie on sharp boundaries of the mesh.""" + + V, F = mesh.vertices, mesh.faces + VN, FN = mesh.vertex_normals, mesh.face_normals + + sharp_mask = np.ones(V.shape[0]) + for i in range(3): + indices = F[:, i] + alignment = np.einsum('ij,ij->i', VN[indices], FN) + dot_stack = np.stack((sharp_mask[indices], alignment), axis=-1) + sharp_mask[indices] = np.min(dot_stack, axis=-1) + + edge_a = np.concatenate([F[:, 0], F[:, 1], F[:, 2]]) + edge_b = np.concatenate([F[:, 1], F[:, 2], F[:, 0]]) + sharp_edges = (sharp_mask[edge_a] < threshold) & (sharp_mask[edge_b] < threshold) + + return edge_a[sharp_edges], edge_b[sharp_edges] + + +def sharp_sample_pointcloud(mesh, num = 16384): + """ Sample points preferentially from sharp edges in the mesh. """ + + edge_a, edge_b = detect_sharp_edges(mesh) + V, VN = mesh.vertices, mesh.vertex_normals + + va, vb = V[edge_a], V[edge_b] + na, nb = VN[edge_a], VN[edge_b] + + edge_lengths = np.linalg.norm(vb - va, axis=-1) + weights = edge_lengths / edge_lengths.sum() + + indices = np.searchsorted(np.cumsum(weights), np.random.rand(num)) + t = np.random.rand(num, 1) + + samples = t * va[indices] + (1 - t) * vb[indices] + normals = t * na[indices] + (1 - t) * nb[indices] + + return samples.astype(np.float32), normals.astype(np.float32) + +def load_surface_sharpedge(mesh, num_points=4096, num_sharp_points=4096, sharpedge_flag = True, device = "cuda"): + """Load a surface with optional sharp-edge annotations from a trimesh mesh.""" + + import trimesh + + try: + mesh_full = trimesh.util.concatenate(mesh.dump()) + except Exception: + mesh_full = trimesh.util.concatenate(mesh) + + mesh_full = normalize_mesh(mesh_full) + + faces = mesh_full.faces + vertices = mesh_full.vertices + origin_face_count = faces.shape[0] + + mesh_surface = trimesh.Trimesh(vertices=vertices, faces=faces[:origin_face_count]) + mesh_fill = trimesh.Trimesh(vertices=vertices, faces=faces[origin_face_count:]) + + area_surface = mesh_surface.area + area_fill = mesh_fill.area + total_area = area_surface + area_fill + + sample_num = 499712 // 2 + fill_ratio = area_fill / total_area if total_area > 0 else 0 + + num_fill = int(sample_num * fill_ratio) + num_surface = sample_num - num_fill + + surf_pts, surf_normals = sample_pointcloud(mesh_surface, num_surface) + fill_pts, fill_normals = (torch.zeros(0, 3), torch.zeros(0, 3)) if num_fill == 0 else sample_pointcloud(mesh_fill, num_fill) + + sharp_pts, sharp_normals = sharp_sample_pointcloud(mesh_surface, sample_num) + + def assemble_tensor(points, normals, label=None): + + data = torch.cat([points, normals], dim=1).half().to(device) + + if label is not None: + label_tensor = torch.full((data.shape[0], 1), float(label), dtype=torch.float16).to(device) + data = torch.cat([data, label_tensor], dim=1) + + return data + + surface = assemble_tensor(torch.cat([surf_pts.to(device), fill_pts.to(device)], dim=0), + torch.cat([surf_normals.to(device), fill_normals.to(device)], dim=0), + label = 0 if sharpedge_flag else None) + + sharp_surface = assemble_tensor(torch.from_numpy(sharp_pts), torch.from_numpy(sharp_normals), + label = 1 if sharpedge_flag else None) + + rng = np.random.default_rng() + + surface = surface[rng.choice(surface.shape[0], num_points, replace = False)] + sharp_surface = sharp_surface[rng.choice(sharp_surface.shape[0], num_sharp_points, replace = False)] + + full = torch.cat([surface, sharp_surface], dim = 0).unsqueeze(0) + + return full + +class SharpEdgeSurfaceLoader: + """ Load mesh surface and sharp edge samples. """ + + def __init__(self, num_uniform_points = 8192, num_sharp_points = 8192): + + self.num_uniform_points = num_uniform_points + self.num_sharp_points = num_sharp_points + self.total_points = num_uniform_points + num_sharp_points + + def __call__(self, mesh_input, device = "cuda"): + mesh = self._load_mesh(mesh_input) + return load_surface_sharpedge(mesh, self.num_uniform_points, self.num_sharp_points, device = device) + + @staticmethod + def _load_mesh(mesh_input): + import trimesh + + if isinstance(mesh_input, str): + mesh = trimesh.load(mesh_input, force="mesh", merge_primitives = True) + else: + mesh = mesh_input + + if isinstance(mesh, trimesh.Scene): + combined = None + for obj in mesh.geometry.values(): + combined = obj if combined is None else combined + obj + return combined + + return mesh + +class DiagonalGaussianDistribution: + def __init__(self, params: torch.Tensor, feature_dim: int = -1): + + # divide quant channels (8) into mean and log variance + self.mean, self.logvar = torch.chunk(params, 2, dim = feature_dim) + + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.std = torch.exp(0.5 * self.logvar) + + def sample(self): + + eps = torch.randn_like(self.std) + z = self.mean + eps * self.std + + return z + +################################################ +# Volume Decoder +################################################ + +class VanillaVolumeDecoder(): @torch.no_grad() - def __call__( - self, - latents: torch.FloatTensor, - geo_decoder: Callable, - bounds: Union[Tuple[float], List[float], float] = 1.01, - num_chunks: int = 10000, - octree_resolution: int = None, - enable_pbar: bool = True, - **kwargs, - ): - device = latents.device - dtype = latents.dtype - batch_size = latents.shape[0] + def __call__(self, latents: torch.Tensor, geo_decoder: callable, octree_resolution: int, bounds = 1.01, + num_chunks: int = 10_000, enable_pbar: bool = True, **kwargs): - # 1. generate query points if isinstance(bounds, float): bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] - bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6]) - xyz_samples, grid_size, length = generate_dense_grid_points( - bbox_min=bbox_min, - bbox_max=bbox_max, - octree_resolution=octree_resolution, - indexing="ij" - ) - xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3) + bbox_min, bbox_max = torch.tensor(bounds[:3]), torch.tensor(bounds[3:]) + + x = torch.linspace(bbox_min[0], bbox_max[0], int(octree_resolution) + 1, dtype = torch.float32) + y = torch.linspace(bbox_min[1], bbox_max[1], int(octree_resolution) + 1, dtype = torch.float32) + z = torch.linspace(bbox_min[2], bbox_max[2], int(octree_resolution) + 1, dtype = torch.float32) + + [xs, ys, zs] = torch.meshgrid(x, y, z, indexing = "ij") + xyz = torch.stack((xs, ys, zs), axis=-1).to(latents.device, dtype = latents.dtype).contiguous().reshape(-1, 3) + grid_size = [int(octree_resolution) + 1, int(octree_resolution) + 1, int(octree_resolution) + 1] - # 2. latents to 3d volume batch_logits = [] - for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), desc="Volume Decoding", + for start in tqdm(range(0, xyz.shape[0], num_chunks), desc="Volume Decoding", disable=not enable_pbar): - chunk_queries = xyz_samples[start: start + num_chunks, :] - chunk_queries = repeat(chunk_queries, "p c -> b p c", b=batch_size) - logits = geo_decoder(queries=chunk_queries, latents=latents) + + chunk_queries = xyz[start: start + num_chunks, :] + chunk_queries = chunk_queries.unsqueeze(0).repeat(latents.shape[0], 1, 1) + logits = geo_decoder(queries = chunk_queries, latents = latents) batch_logits.append(logits) - grid_logits = torch.cat(batch_logits, dim=1) - grid_logits = grid_logits.view((batch_size, *grid_size)).float() + grid_logits = torch.cat(batch_logits, dim = 1) + grid_logits = grid_logits.view((latents.shape[0], *grid_size)).float() return grid_logits - class FourierEmbedder(nn.Module): """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts each feature dimension of `x[..., i]` into: @@ -175,13 +552,11 @@ class FourierEmbedder(nn.Module): else: return x - class CrossAttentionProcessor: def __call__(self, attn, q, k, v): out = comfy.ops.scaled_dot_product_attention(q, k, v) return out - class DropPath(nn.Module): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). """ @@ -232,38 +607,41 @@ class MLP(nn.Module): def forward(self, x): return self.drop_path(self.c_proj(self.gelu(self.c_fc(x)))) - class QKVMultiheadCrossAttention(nn.Module): def __init__( self, - *, heads: int, + n_data = None, width=None, qk_norm=False, norm_layer=ops.LayerNorm ): super().__init__() self.heads = heads + self.n_data = n_data self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() - self.attn_processor = CrossAttentionProcessor() - def forward(self, q, kv): + _, n_ctx, _ = q.shape bs, n_data, width = kv.shape + attn_ch = width // self.heads // 2 q = q.view(bs, n_ctx, self.heads, -1) + kv = kv.view(bs, n_data, self.heads, -1) k, v = torch.split(kv, attn_ch, dim=-1) q = self.q_norm(q) k = self.k_norm(k) - q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) - out = self.attn_processor(self, q, k, v) - out = out.transpose(1, 2).reshape(bs, n_ctx, -1) - return out + q, k, v = [t.permute(0, 2, 1, 3) for t in (q, k, v)] + out = F.scaled_dot_product_attention(q, k, v) + + out = out.transpose(1, 2).reshape(bs, n_ctx, -1) + + return out class MultiheadCrossAttention(nn.Module): def __init__( @@ -306,7 +684,6 @@ class MultiheadCrossAttention(nn.Module): x = self.c_proj(x) return x - class ResidualCrossAttentionBlock(nn.Module): def __init__( self, @@ -366,7 +743,7 @@ class QKVMultiheadAttention(nn.Module): q = self.q_norm(q) k = self.k_norm(k) - q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) + q, k, v = [t.permute(0, 2, 1, 3) for t in (q, k, v)] out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1) return out @@ -383,8 +760,7 @@ class MultiheadAttention(nn.Module): drop_path_rate: float = 0.0 ): super().__init__() - self.width = width - self.heads = heads + self.c_qkv = ops.Linear(width, width * 3, bias=qkv_bias) self.c_proj = ops.Linear(width, width) self.attention = QKVMultiheadAttention( @@ -491,7 +867,7 @@ class CrossAttentionDecoder(nn.Module): self.query_proj = ops.Linear(self.fourier_embedder.out_dim, width) if self.downsample_ratio != 1: self.latents_proj = ops.Linear(width * downsample_ratio, width) - if self.enable_ln_post == False: + if not self.enable_ln_post: qk_norm = False self.cross_attn_decoder = ResidualCrossAttentionBlock( width=width, @@ -522,28 +898,44 @@ class CrossAttentionDecoder(nn.Module): class ShapeVAE(nn.Module): def __init__( - self, - *, - embed_dim: int, - width: int, - heads: int, - num_decoder_layers: int, - geo_decoder_downsample_ratio: int = 1, - geo_decoder_mlp_expand_ratio: int = 4, - geo_decoder_ln_post: bool = True, - num_freqs: int = 8, - include_pi: bool = True, - qkv_bias: bool = True, - qk_norm: bool = False, - label_type: str = "binary", - drop_path_rate: float = 0.0, - scale_factor: float = 1.0, + self, + *, + num_latents: int = 4096, + embed_dim: int = 64, + width: int = 1024, + heads: int = 16, + num_decoder_layers: int = 16, + num_encoder_layers: int = 8, + pc_size: int = 81920, + pc_sharpedge_size: int = 0, + point_feats: int = 4, + downsample_ratio: int = 20, + geo_decoder_downsample_ratio: int = 1, + geo_decoder_mlp_expand_ratio: int = 4, + geo_decoder_ln_post: bool = True, + num_freqs: int = 8, + qkv_bias: bool = False, + qk_norm: bool = True, + drop_path_rate: float = 0.0, + include_pi: bool = False, + scale_factor: float = 1.0039506158752403, + label_type: str = "binary", ): super().__init__() self.geo_decoder_ln_post = geo_decoder_ln_post self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi) + self.encoder = PointCrossAttention(layers = num_encoder_layers, + num_latents = num_latents, + downsample_ratio = downsample_ratio, + heads = heads, + pc_size = pc_size, + width = width, + point_feats = point_feats, + fourier_embedder = self.fourier_embedder, + pc_sharpedge_size = pc_sharpedge_size) + self.post_kl = ops.Linear(embed_dim, width) self.transformer = Transformer( @@ -583,5 +975,14 @@ class ShapeVAE(nn.Module): grid_logits = self.volume_decoder(latents, self.geo_decoder, bounds=bounds, num_chunks=num_chunks, octree_resolution=octree_resolution, enable_pbar=enable_pbar) return grid_logits.movedim(-2, -1) - def encode(self, x): - return None + def encode(self, surface): + + pc, feats = surface[:, :, :3], surface[:, :, 3:] + latents = self.encoder(pc, feats) + + moments = self.pre_kl(latents) + posterior = DiagonalGaussianDistribution(moments, feature_dim = -1) + + latents = posterior.sample() + + return latents diff --git a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py new file mode 100644 index 000000000..48575bb3c --- /dev/null +++ b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py @@ -0,0 +1,658 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from comfy.ldm.modules.attention import optimized_attention + +class GELU(nn.Module): + + def __init__(self, dim_in: int, dim_out: int, operations, device, dtype): + super().__init__() + self.proj = operations.Linear(dim_in, dim_out, device = device, dtype = dtype) + + def gelu(self, gate: torch.Tensor) -> torch.Tensor: + + if gate.device.type == "mps": + return F.gelu(gate.to(dtype = torch.float32)).to(dtype = gate.dtype) + + return F.gelu(gate) + + def forward(self, hidden_states): + + hidden_states = self.proj(hidden_states) + hidden_states = self.gelu(hidden_states) + + return hidden_states + +class FeedForward(nn.Module): + + def __init__(self, dim: int, dim_out = None, mult: int = 4, + dropout: float = 0.0, inner_dim = None, operations = None, device = None, dtype = None): + + super().__init__() + if inner_dim is None: + inner_dim = int(dim * mult) + + dim_out = dim_out if dim_out is not None else dim + + act_fn = GELU(dim, inner_dim, operations = operations, device = device, dtype = dtype) + + self.net = nn.ModuleList([]) + self.net.append(act_fn) + + self.net.append(nn.Dropout(dropout)) + self.net.append(operations.Linear(inner_dim, dim_out, device = device, dtype = dtype)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for module in self.net: + hidden_states = module(hidden_states) + return hidden_states + +class AddAuxLoss(torch.autograd.Function): + + @staticmethod + def forward(ctx, x, loss): + # do nothing in forward (no computation) + ctx.requires_aux_loss = loss.requires_grad + ctx.dtype = loss.dtype + + return x + + @staticmethod + def backward(ctx, grad_output): + # add the aux loss gradients + grad_loss = None + # put the aux grad the same as the main grad loss + # aux grad contributes equally + if ctx.requires_aux_loss: + grad_loss = torch.ones(1, dtype = ctx.dtype, device = grad_output.device) + + return grad_output, grad_loss + +class MoEGate(nn.Module): + + def __init__(self, embed_dim, num_experts=16, num_experts_per_tok=2, aux_loss_alpha=0.01, device = None, dtype = None): + + super().__init__() + self.top_k = num_experts_per_tok + self.n_routed_experts = num_experts + + self.alpha = aux_loss_alpha + + self.gating_dim = embed_dim + self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim), device = device, dtype = dtype)) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + + # flatten hidden states + hidden_states = hidden_states.view(-1, hidden_states.size(-1)) + + # get logits and pass it to softmax + logits = F.linear(hidden_states, self.weight, bias = None) + scores = logits.softmax(dim = -1) + + topk_weight, topk_idx = torch.topk(scores, k = self.top_k, dim = -1, sorted = False) + + if self.training and self.alpha > 0.0: + scores_for_aux = scores + + # used bincount instead of one hot encoding + counts = torch.bincount(topk_idx.view(-1), minlength = self.n_routed_experts).float() + ce = counts / topk_idx.numel() # normalized expert usage + + # mean expert score + Pi = scores_for_aux.mean(0) + + # expert balance loss + aux_loss = (Pi * ce * self.n_routed_experts).sum() * self.alpha + else: + aux_loss = None + + return topk_idx, topk_weight, aux_loss + +class MoEBlock(nn.Module): + def __init__(self, dim, num_experts: int = 6, moe_top_k: int = 2, dropout: float = 0.0, + ff_inner_dim: int = None, operations = None, device = None, dtype = None): + super().__init__() + + self.moe_top_k = moe_top_k + self.num_experts = num_experts + + self.experts = nn.ModuleList([ + FeedForward(dim, dropout = dropout, inner_dim = ff_inner_dim, operations = operations, device = device, dtype = dtype) + for _ in range(num_experts) + ]) + + self.gate = MoEGate(dim, num_experts = num_experts, num_experts_per_tok = moe_top_k, device = device, dtype = dtype) + self.shared_experts = FeedForward(dim, dropout = dropout, inner_dim = ff_inner_dim, operations = operations, device = device, dtype = dtype) + + def forward(self, hidden_states) -> torch.Tensor: + + identity = hidden_states + orig_shape = hidden_states.shape + topk_idx, topk_weight, aux_loss = self.gate(hidden_states) + + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + flat_topk_idx = topk_idx.view(-1) + + if self.training: + + hidden_states = hidden_states.repeat_interleave(self.moe_top_k, dim = 0) + y = torch.empty_like(hidden_states, dtype = hidden_states.dtype) + + for i, expert in enumerate(self.experts): + tmp = expert(hidden_states[flat_topk_idx == i]) + y[flat_topk_idx == i] = tmp.to(hidden_states.dtype) + + y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim = 1) + y = y.view(*orig_shape) + + y = AddAuxLoss.apply(y, aux_loss) + else: + y = self.moe_infer(hidden_states, flat_expert_indices = flat_topk_idx,flat_expert_weights = topk_weight.view(-1, 1)).view(*orig_shape) + + y = y + self.shared_experts(identity) + + return y + + @torch.no_grad() + def moe_infer(self, x, flat_expert_indices, flat_expert_weights): + + expert_cache = torch.zeros_like(x) + idxs = flat_expert_indices.argsort() + + # no need for .numpy().cpu() here + tokens_per_expert = flat_expert_indices.bincount().cumsum(0) + token_idxs = idxs // self.moe_top_k + + for i, end_idx in enumerate(tokens_per_expert): + + start_idx = 0 if i == 0 else tokens_per_expert[i-1] + + if start_idx == end_idx: + continue + + expert = self.experts[i] + exp_token_idx = token_idxs[start_idx:end_idx] + + expert_tokens = x[exp_token_idx] + expert_out = expert(expert_tokens) + + expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]]) + + # use index_add_ with a 1-D index tensor directly avoids building a large [N, D] index map and extra memcopy required by scatter_reduce_ + # + avoid dtype conversion + expert_cache.index_add_(0, exp_token_idx, expert_out) + + return expert_cache + +class Timesteps(nn.Module): + def __init__(self, num_channels: int, downscale_freq_shift: float = 0.0, + scale: float = 1.0, max_period: int = 10000): + super().__init__() + + self.num_channels = num_channels + half_dim = num_channels // 2 + + # precompute the “inv_freq” vector once + exponent = -math.log(max_period) * torch.arange( + half_dim, dtype=torch.float32 + ) / (half_dim - downscale_freq_shift) + + inv_freq = torch.exp(exponent) + + # pad + if num_channels % 2 == 1: + # we’ll pad a zero at the end of the cos-half + inv_freq = torch.cat([inv_freq, inv_freq.new_zeros(1)]) + + # register to buffer so it moves with the device + self.register_buffer("inv_freq", inv_freq, persistent = False) + self.scale = scale + + def forward(self, timesteps: torch.Tensor): + + x = timesteps.float().unsqueeze(1) * self.inv_freq.to(timesteps.device).unsqueeze(0) + + + # fused CUDA kernels for sin and cos + sin_emb = x.sin() + cos_emb = x.cos() + + emb = torch.cat([sin_emb, cos_emb], dim = 1) + + # scale factor + if self.scale != 1.0: + emb = emb * self.scale + + # If we padded inv_freq for odd, emb is already wide enough; otherwise: + if emb.shape[1] > self.num_channels: + emb = emb[:, :self.num_channels] + + return emb + +class TimestepEmbedder(nn.Module): + def __init__(self, hidden_size, frequency_embedding_size = 256, cond_proj_dim = None, operations = None, device = None, dtype = None): + super().__init__() + + self.mlp = nn.Sequential( + operations.Linear(hidden_size, frequency_embedding_size, bias=True, device = device, dtype = dtype), + nn.GELU(), + operations.Linear(frequency_embedding_size, hidden_size, bias=True, device = device, dtype = dtype), + ) + self.frequency_embedding_size = frequency_embedding_size + + if cond_proj_dim is not None: + self.cond_proj = operations.Linear(cond_proj_dim, frequency_embedding_size, bias=False, device = device, dtype = dtype) + + self.time_embed = Timesteps(hidden_size) + + def forward(self, timesteps, condition): + + timestep_embed = self.time_embed(timesteps).type(self.mlp[0].weight.dtype) + + if condition is not None: + cond_embed = self.cond_proj(condition) + timestep_embed = timestep_embed + cond_embed + + time_conditioned = self.mlp(timestep_embed.to(self.mlp[0].weight.device)) + + # for broadcasting with image tokens + return time_conditioned.unsqueeze(1) + +class MLP(nn.Module): + def __init__(self, *, width: int, operations = None, device = None, dtype = None): + super().__init__() + self.width = width + self.fc1 = operations.Linear(width, width * 4, device = device, dtype = dtype) + self.fc2 = operations.Linear(width * 4, width, device = device, dtype = dtype) + self.gelu = nn.GELU() + + def forward(self, x): + return self.fc2(self.gelu(self.fc1(x))) + +class CrossAttention(nn.Module): + def __init__( + self, + qdim, + kdim, + num_heads, + qkv_bias=True, + qk_norm=False, + norm_layer=nn.LayerNorm, + use_fp16: bool = False, + operations = None, + dtype = None, + device = None, + **kwargs, + ): + super().__init__() + self.qdim = qdim + self.kdim = kdim + + self.num_heads = num_heads + self.head_dim = self.qdim // num_heads + + self.scale = self.head_dim ** -0.5 + + self.to_q = operations.Linear(qdim, qdim, bias=qkv_bias, device = device, dtype = dtype) + self.to_k = operations.Linear(kdim, qdim, bias=qkv_bias, device = device, dtype = dtype) + self.to_v = operations.Linear(kdim, qdim, bias=qkv_bias, device = device, dtype = dtype) + + if use_fp16: + eps = 1.0 / 65504 + else: + eps = 1e-6 + + if norm_layer == nn.LayerNorm: + norm_layer = operations.LayerNorm + else: + norm_layer = operations.RMSNorm + + self.q_norm = norm_layer(self.head_dim, elementwise_affine=True, eps = eps, device = device, dtype = dtype) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim, elementwise_affine=True, eps = eps, device = device, dtype = dtype) if qk_norm else nn.Identity() + self.out_proj = operations.Linear(qdim, qdim, bias=True, device = device, dtype = dtype) + + def forward(self, x, y): + + b, s1, _ = x.shape + _, s2, _ = y.shape + + y = y.to(next(self.to_k.parameters()).dtype) + + q = self.to_q(x) + k = self.to_k(y) + v = self.to_v(y) + + kv = torch.cat((k, v), dim=-1) + split_size = kv.shape[-1] // self.num_heads // 2 + + kv = kv.view(1, -1, self.num_heads, split_size * 2) + k, v = torch.split(kv, split_size, dim=-1) + + q = q.view(b, s1, self.num_heads, self.head_dim) + k = k.view(b, s2, self.num_heads, self.head_dim) + v = v.reshape(b, s2, self.num_heads * self.head_dim) + + q = self.q_norm(q) + k = self.k_norm(k) + + x = optimized_attention( + q.reshape(b, s1, self.num_heads * self.head_dim), + k.reshape(b, s2, self.num_heads * self.head_dim), + v, + heads=self.num_heads, + ) + + out = self.out_proj(x) + + return out + +class Attention(nn.Module): + + def __init__( + self, + dim, + num_heads, + qkv_bias = True, + qk_norm = False, + norm_layer = nn.LayerNorm, + use_fp16: bool = False, + operations = None, + device = None, + dtype = None + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.head_dim = self.dim // num_heads + self.scale = self.head_dim ** -0.5 + + self.to_q = operations.Linear(dim, dim, bias = qkv_bias, device = device, dtype = dtype) + self.to_k = operations.Linear(dim, dim, bias = qkv_bias, device = device, dtype = dtype) + self.to_v = operations.Linear(dim, dim, bias = qkv_bias, device = device, dtype = dtype) + + if use_fp16: + eps = 1.0 / 65504 + else: + eps = 1e-6 + + if norm_layer == nn.LayerNorm: + norm_layer = operations.LayerNorm + else: + norm_layer = operations.RMSNorm + + self.q_norm = norm_layer(self.head_dim, elementwise_affine=True, eps = eps, device = device, dtype = dtype) if qk_norm else nn.Identity() + self.k_norm = norm_layer(self.head_dim, elementwise_affine=True, eps = eps, device = device, dtype = dtype) if qk_norm else nn.Identity() + self.out_proj = operations.Linear(dim, dim, device = device, dtype = dtype) + + def forward(self, x): + B, N, _ = x.shape + + query = self.to_q(x) + key = self.to_k(x) + value = self.to_v(x) + + qkv_combined = torch.cat((query, key, value), dim=-1) + split_size = qkv_combined.shape[-1] // self.num_heads // 3 + + qkv = qkv_combined.view(1, -1, self.num_heads, split_size * 3) + query, key, value = torch.split(qkv, split_size, dim=-1) + + query = query.reshape(B, N, self.num_heads, self.head_dim) + key = key.reshape(B, N, self.num_heads, self.head_dim) + value = value.reshape(B, N, self.num_heads * self.head_dim) + + query = self.q_norm(query) + key = self.k_norm(key) + + x = optimized_attention( + query.reshape(B, N, self.num_heads * self.head_dim), + key.reshape(B, N, self.num_heads * self.head_dim), + value, + heads=self.num_heads, + ) + + x = self.out_proj(x) + return x + +class HunYuanDiTBlock(nn.Module): + def __init__( + self, + hidden_size, + c_emb_size, + num_heads, + text_states_dim=1024, + qk_norm=False, + norm_layer=nn.LayerNorm, + qk_norm_layer=nn.RMSNorm, + qkv_bias=True, + skip_connection=True, + timested_modulate=False, + use_moe: bool = False, + num_experts: int = 8, + moe_top_k: int = 2, + use_fp16: bool = False, + operations = None, + device = None, dtype = None + ): + super().__init__() + + # eps can't be 1e-6 in fp16 mode because of numerical stability issues + if use_fp16: + eps = 1.0 / 65504 + else: + eps = 1e-6 + + self.norm1 = norm_layer(hidden_size, elementwise_affine = True, eps = eps, device = device, dtype = dtype) + + self.attn1 = Attention(hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, qk_norm=qk_norm, + norm_layer=qk_norm_layer, use_fp16 = use_fp16, device = device, dtype = dtype, operations = operations) + + self.norm2 = norm_layer(hidden_size, elementwise_affine = True, eps = eps, device = device, dtype = dtype) + + self.timested_modulate = timested_modulate + if self.timested_modulate: + self.default_modulation = nn.Sequential( + nn.SiLU(), + operations.Linear(c_emb_size, hidden_size, bias=True, device = device, dtype = dtype) + ) + + self.attn2 = CrossAttention(hidden_size, text_states_dim, num_heads=num_heads, qkv_bias=qkv_bias, + qk_norm=qk_norm, norm_layer=qk_norm_layer, use_fp16 = use_fp16, + device = device, dtype = dtype, operations = operations) + + self.norm3 = norm_layer(hidden_size, elementwise_affine = True, eps = eps, device = device, dtype = dtype) + + if skip_connection: + self.skip_norm = norm_layer(hidden_size, elementwise_affine = True, eps = eps, device = device, dtype = dtype) + self.skip_linear = operations.Linear(2 * hidden_size, hidden_size, device = device, dtype = dtype) + else: + self.skip_linear = None + + self.use_moe = use_moe + + if self.use_moe: + self.moe = MoEBlock( + hidden_size, + num_experts = num_experts, + moe_top_k = moe_top_k, + dropout = 0.0, + ff_inner_dim = int(hidden_size * 4.0), + device = device, dtype = dtype, + operations = operations + ) + else: + self.mlp = MLP(width=hidden_size, operations=operations, device = device, dtype = dtype) + + def forward(self, hidden_states, conditioning=None, text_states=None, skip_tensor=None): + + if self.skip_linear is not None: + combined = torch.cat([skip_tensor, hidden_states], dim=-1) + hidden_states = self.skip_linear(combined) + hidden_states = self.skip_norm(hidden_states) + + # self attention + if self.timested_modulate: + modulation_shift = self.default_modulation(conditioning).unsqueeze(dim=1) + hidden_states = hidden_states + modulation_shift + + self_attn_out = self.attn1(self.norm1(hidden_states)) + hidden_states = hidden_states + self_attn_out + + # cross attention + hidden_states = hidden_states + self.attn2(self.norm2(hidden_states), text_states) + + # MLP Layer + mlp_input = self.norm3(hidden_states) + + if self.use_moe: + hidden_states = hidden_states + self.moe(mlp_input) + else: + hidden_states = hidden_states + self.mlp(mlp_input) + + return hidden_states + +class FinalLayer(nn.Module): + + def __init__(self, final_hidden_size, out_channels, operations, use_fp16: bool = False, device = None, dtype = None): + super().__init__() + + if use_fp16: + eps = 1.0 / 65504 + else: + eps = 1e-6 + + self.norm_final = operations.LayerNorm(final_hidden_size, elementwise_affine = True, eps = eps, device = device, dtype = dtype) + self.linear = operations.Linear(final_hidden_size, out_channels, bias = True, device = device, dtype = dtype) + + def forward(self, x): + x = self.norm_final(x) + x = x[:, 1:] + x = self.linear(x) + return x + +class HunYuanDiTPlain(nn.Module): + + # init with the defaults values from https://huggingface.co/tencent/Hunyuan3D-2.1/blob/main/hunyuan3d-dit-v2-1/config.yaml + def __init__( + self, + in_channels: int = 64, + hidden_size: int = 2048, + context_dim: int = 1024, + depth: int = 21, + num_heads: int = 16, + qk_norm: bool = True, + qkv_bias: bool = False, + num_moe_layers: int = 6, + guidance_cond_proj_dim = 2048, + norm_type = 'layer', + num_experts: int = 8, + moe_top_k: int = 2, + use_fp16: bool = False, + dtype = None, + device = None, + operations = None, + **kwargs + ): + + self.dtype = dtype + + super().__init__() + + self.depth = depth + + self.in_channels = in_channels + self.out_channels = in_channels + + self.num_heads = num_heads + self.hidden_size = hidden_size + + norm = operations.LayerNorm if norm_type == 'layer' else operations.RMSNorm + qk_norm = operations.RMSNorm + + self.context_dim = context_dim + self.guidance_cond_proj_dim = guidance_cond_proj_dim + + self.x_embedder = operations.Linear(in_channels, hidden_size, bias = True, device = device, dtype = dtype) + self.t_embedder = TimestepEmbedder(hidden_size, hidden_size * 4, cond_proj_dim = guidance_cond_proj_dim, device = device, dtype = dtype, operations = operations) + + + # HUnYuanDiT Blocks + self.blocks = nn.ModuleList([ + HunYuanDiTBlock(hidden_size=hidden_size, + c_emb_size=hidden_size, + num_heads=num_heads, + text_states_dim=context_dim, + qk_norm=qk_norm, + norm_layer = norm, + qk_norm_layer = qk_norm, + skip_connection=layer > depth // 2, + qkv_bias=qkv_bias, + use_moe=True if depth - layer <= num_moe_layers else False, + num_experts=num_experts, + moe_top_k=moe_top_k, + use_fp16 = use_fp16, + device = device, dtype = dtype, operations = operations) + for layer in range(depth) + ]) + + self.depth = depth + + self.final_layer = FinalLayer(hidden_size, self.out_channels, use_fp16 = use_fp16, operations = operations, device = device, dtype = dtype) + + def forward(self, x, t, context, transformer_options = {}, **kwargs): + + x = x.movedim(-1, -2) + uncond_emb, cond_emb = context.chunk(2, dim = 0) + + context = torch.cat([cond_emb, uncond_emb], dim = 0) + main_condition = context + + t = 1.0 - t + + time_embedded = self.t_embedder(t, condition = kwargs.get('guidance_cond')) + + x = x.to(dtype = next(self.x_embedder.parameters()).dtype) + x_embedded = self.x_embedder(x) + + combined = torch.cat([time_embedded, x_embedded], dim=1) + + def block_wrap(args): + return block( + args["x"], + args["t"], + args["cond"], + skip_tensor=args.get("skip"),) + + skip_stack = [] + patches_replace = transformer_options.get("patches_replace", {}) + blocks_replace = patches_replace.get("dit", {}) + for idx, block in enumerate(self.blocks): + if idx <= self.depth // 2: + skip_input = None + else: + skip_input = skip_stack.pop() + + if ("block", idx) in blocks_replace: + + combined = blocks_replace[("block", idx)]( + { + "x": combined, + "t": time_embedded, + "cond": main_condition, + "skip": skip_input, + }, + {"original_block": block_wrap}, + ) + else: + combined = block(combined, time_embedded, main_condition, skip_tensor=skip_input) + + if idx < self.depth // 2: + skip_stack.append(combined) + + output = self.final_layer(combined) + output = output.movedim(-2, -1) * (-1.0) + + cond_emb, uncond_emb = output.chunk(2, dim = 0) + return torch.cat([uncond_emb, cond_emb]) diff --git a/comfy/model_base.py b/comfy/model_base.py index 56a6798be..39a3344bc 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -16,6 +16,8 @@ along with this program. If not, see . """ +import comfy.ldm.hunyuan3dv2_1 +import comfy.ldm.hunyuan3dv2_1.hunyuandit import torch import logging from comfy.ldm.modules.diffusionmodules.openaimodel import UNetModel, Timestep @@ -1282,6 +1284,21 @@ class Hunyuan3Dv2(BaseModel): out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance])) return out +class Hunyuan3Dv2_1(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan3dv2_1.hunyuandit.HunYuanDiTPlain) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + + guidance = kwargs.get("guidance", 5.0) + if guidance is not None: + out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance])) + return out + class HiDream(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hidream.model.HiDreamImageTransformer2DModel) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 9f3ab64df..75552ede9 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -400,6 +400,20 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys return dit_config + if f"{key_prefix}t_embedder.mlp.2.weight" in state_dict_keys: # Hunyuan 3D 2.1 + + dit_config = {} + dit_config["image_model"] = "hunyuan3d2_1" + dit_config["in_channels"] = state_dict[f"{key_prefix}x_embedder.weight"].shape[1] + dit_config["context_dim"] = 1024 + dit_config["hidden_size"] = state_dict[f"{key_prefix}x_embedder.weight"].shape[0] + dit_config["mlp_ratio"] = 4.0 + dit_config["num_heads"] = 16 + dit_config["depth"] = count_blocks(state_dict_keys, f"{key_prefix}blocks.{{}}") + dit_config["qkv_bias"] = False + dit_config["guidance_cond_proj_dim"] = None#f"{key_prefix}t_embedder.cond_proj.weight" in state_dict_keys + return dit_config + if '{}caption_projection.0.linear.weight'.format(key_prefix) in state_dict_keys: # HiDream dit_config = {} dit_config["image_model"] = "hidream" diff --git a/comfy/sd.py b/comfy/sd.py index bb5d61fb3..014f797ca 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -446,17 +446,29 @@ class VAE: self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype) self.memory_used_decode = lambda shape, dtype: 7000 * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype) + # Hunyuan 3d v2 2.0 & 2.1 elif "geo_decoder.cross_attn_decoder.ln_1.bias" in sd: + self.latent_dim = 1 - ln_post = "geo_decoder.ln_post.weight" in sd - inner_size = sd["geo_decoder.output_proj.weight"].shape[1] - downsample_ratio = sd["post_kl.weight"].shape[0] // inner_size - mlp_expand = sd["geo_decoder.cross_attn_decoder.mlp.c_fc.weight"].shape[0] // inner_size - self.memory_used_encode = lambda shape, dtype: (1000 * shape[2]) * model_management.dtype_size(dtype) # TODO - self.memory_used_decode = lambda shape, dtype: (1024 * 1024 * 1024 * 2.0) * model_management.dtype_size(dtype) # TODO - ddconfig = {"embed_dim": 64, "num_freqs": 8, "include_pi": False, "heads": 16, "width": 1024, "num_decoder_layers": 16, "qkv_bias": False, "qk_norm": True, "geo_decoder_mlp_expand_ratio": mlp_expand, "geo_decoder_downsample_ratio": downsample_ratio, "geo_decoder_ln_post": ln_post} - self.first_stage_model = comfy.ldm.hunyuan3d.vae.ShapeVAE(**ddconfig) + + def estimate_memory(shape, dtype, num_layers = 16, kv_cache_multiplier = 2): + batch, num_tokens, hidden_dim = shape + dtype_size = model_management.dtype_size(dtype) + + total_mem = batch * num_tokens * hidden_dim * dtype_size * (1 + kv_cache_multiplier * num_layers) + return total_mem + + # better memory estimations + self.memory_used_encode = lambda shape, dtype, num_layers = 8, kv_cache_multiplier = 0:\ + estimate_memory(shape, dtype, num_layers, kv_cache_multiplier) + + self.memory_used_decode = lambda shape, dtype, num_layers = 16, kv_cache_multiplier = 2: \ + estimate_memory(shape, dtype, num_layers, kv_cache_multiplier) + + self.first_stage_model = comfy.ldm.hunyuan3d.vae.ShapeVAE() self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] + + elif "vocoder.backbone.channel_layers.0.0.bias" in sd: #Ace Step Audio self.first_stage_model = comfy.ldm.ace.vae.music_dcae_pipeline.MusicDCAE(source_sample_rate=44100) self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype) @@ -1046,6 +1058,27 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c model = None model_patcher = None + if isinstance(sd, dict) and all(k in sd for k in ["model", "vae", "conditioner"]): + from collections import OrderedDict + import gc + + merged_sd = OrderedDict() + + for k, v in sd["model"].items(): + merged_sd[f"model.{k}"] = v + + for k, v in sd["vae"].items(): + merged_sd[f"vae.{k}"] = v + + for key, value in sd["conditioner"].items(): + merged_sd[f"conditioner.{key}"] = value + + sd = merged_sd + + del merged_sd + gc.collect() + torch.cuda.empty_cache() + diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd) parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix) weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 76260de00..75dad277d 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1128,6 +1128,17 @@ class Hunyuan3Dv2(supported_models_base.BASE): def clip_target(self, state_dict={}): return None +class Hunyuan3Dv2_1(Hunyuan3Dv2): + unet_config = { + "image_model": "hunyuan3d2_1", + } + + latent_format = latent_formats.Hunyuan3Dv2_1 + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.Hunyuan3Dv2_1(self, device = device) + return out + class Hunyuan3Dv2mini(Hunyuan3Dv2): unet_config = { "image_model": "hunyuan3d2", @@ -1285,6 +1296,6 @@ class QwenImage(supported_models_base.BASE): return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_hunyuan3d.py b/comfy_extras/nodes_hunyuan3d.py index 51e45336a..f6e71e0a8 100644 --- a/comfy_extras/nodes_hunyuan3d.py +++ b/comfy_extras/nodes_hunyuan3d.py @@ -8,13 +8,16 @@ import folder_paths import comfy.model_management from comfy.cli_args import args - class EmptyLatentHunyuan3Dv2: @classmethod def INPUT_TYPES(s): - return {"required": {"resolution": ("INT", {"default": 3072, "min": 1, "max": 8192}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent images in the batch."}), - }} + return { + "required": { + "resolution": ("INT", {"default": 3072, "min": 1, "max": 8192}), + "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent images in the batch."}), + } + } + RETURN_TYPES = ("LATENT",) FUNCTION = "generate" @@ -24,7 +27,6 @@ class EmptyLatentHunyuan3Dv2: latent = torch.zeros([batch_size, 64, resolution], device=comfy.model_management.intermediate_device()) return ({"samples": latent, "type": "hunyuan3dv2"}, ) - class Hunyuan3Dv2Conditioning: @classmethod def INPUT_TYPES(s): @@ -81,7 +83,6 @@ class VOXEL: def __init__(self, data): self.data = data - class VAEDecodeHunyuan3D: @classmethod def INPUT_TYPES(s): @@ -99,7 +100,6 @@ class VAEDecodeHunyuan3D: voxels = VOXEL(vae.decode(samples["samples"], vae_options={"num_chunks": num_chunks, "octree_resolution": octree_resolution})) return (voxels, ) - def voxel_to_mesh(voxels, threshold=0.5, device=None): if device is None: device = torch.device("cpu") @@ -230,13 +230,9 @@ def voxel_to_mesh_surfnet(voxels, threshold=0.5, device=None): [0, 0, 1], [1, 0, 1], [0, 1, 1], [1, 1, 1] ], device=device) - corner_values = torch.zeros((cell_positions.shape[0], 8), device=device) - for c, (dz, dy, dx) in enumerate(corner_offsets): - corner_values[:, c] = padded[ - cell_positions[:, 0] + dz, - cell_positions[:, 1] + dy, - cell_positions[:, 2] + dx - ] + pos = cell_positions.unsqueeze(1) + corner_offsets.unsqueeze(0) + z_idx, y_idx, x_idx = pos.unbind(-1) + corner_values = padded[z_idx, y_idx, x_idx] corner_signs = corner_values > threshold has_inside = torch.any(corner_signs, dim=1) diff --git a/nodes.py b/nodes.py index 6c2f9dd14..1afe5601a 100644 --- a/nodes.py +++ b/nodes.py @@ -998,20 +998,31 @@ class CLIPVisionLoader: class CLIPVisionEncode: @classmethod def INPUT_TYPES(s): - return {"required": { "clip_vision": ("CLIP_VISION",), - "image": ("IMAGE",), - "crop": (["center", "none"],) - }} + return { + "required": { + "clip_vision": ("CLIP_VISION",), + "image": ("IMAGE",), + "crop": (["center", "none", "recenter"],), + }, + "optional": { + "border_ratio": ("FLOAT", {"default": 0.15, "min": 0.0, "max": 0.5, "step": 0.01, "visible_if": {"crop": "recenter"},}), + } + } + RETURN_TYPES = ("CLIP_VISION_OUTPUT",) FUNCTION = "encode" CATEGORY = "conditioning" - def encode(self, clip_vision, image, crop): - crop_image = True - if crop != "center": - crop_image = False - output = clip_vision.encode_image(image, crop=crop_image) + def encode(self, clip_vision, image, crop, border_ratio): + crop_image = crop == "center" + + if crop == "recenter": + crop_image = True + else: + border_ratio = None + + output = clip_vision.encode_image(image, crop=crop_image, border_ratio = border_ratio) return (output,) class StyleModelLoader: diff --git a/requirements.txt b/requirements.txt index 3008a5dc3..564fa6e23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,4 +27,4 @@ kornia>=0.7.1 spandrel soundfile pydantic~=2.0 -pydantic-settings~=2.0 +pydantic-settings~=2.0 \ No newline at end of file From c9ebe70072213a875ffbe40cc1b36820b2005211 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 4 Sep 2025 17:39:02 -0700 Subject: [PATCH 241/759] Some changes to the previous hunyuan PR. (#9725) --- comfy/clip_vision.py | 225 +------------------------------------------ comfy/sd.py | 21 ---- nodes.py | 29 ++---- requirements.txt | 2 +- 4 files changed, 14 insertions(+), 263 deletions(-) diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py index 4bc640e8b..447b1ce4a 100644 --- a/comfy/clip_vision.py +++ b/comfy/clip_vision.py @@ -17,227 +17,10 @@ class Output: def __setitem__(self, key, item): setattr(self, key, item) - -def cubic_kernel(x, a: float = -0.75): - absx = x.abs() - absx2 = absx ** 2 - absx3 = absx ** 3 - - w = (a + 2) * absx3 - (a + 3) * absx2 + 1 - w2 = a * absx3 - 5*a * absx2 + 8*a * absx - 4*a - - return torch.where(absx <= 1, w, torch.where(absx < 2, w2, torch.zeros_like(x))) - -def get_indices_weights(in_size, out_size, scale): - # OpenCV-style half-pixel mapping - x = torch.arange(out_size, dtype=torch.float32) - x = (x + 0.5) / scale - 0.5 - - x0 = x.floor().long() - dx = x.unsqueeze(1) - (x0.unsqueeze(1) + torch.arange(-1, 3)) - - weights = cubic_kernel(dx) - weights = weights / weights.sum(dim=1, keepdim=True) - - indices = x0.unsqueeze(1) + torch.arange(-1, 3) - indices = indices.clamp(0, in_size - 1) - - return indices, weights - -def resize_cubic_1d(x, out_size, dim): - b, c, h, w = x.shape - in_size = h if dim == 2 else w - scale = out_size / in_size - - indices, weights = get_indices_weights(in_size, out_size, scale) - - if dim == 2: - x = x.permute(0, 1, 3, 2) - x = x.reshape(-1, h) - else: - x = x.reshape(-1, w) - - gathered = x[:, indices] - out = (gathered * weights.unsqueeze(0)).sum(dim=2) - - if dim == 2: - out = out.reshape(b, c, w, out_size).permute(0, 1, 3, 2) - else: - out = out.reshape(b, c, h, out_size) - - return out - -def resize_cubic(img: torch.Tensor, size: tuple) -> torch.Tensor: - """ - Resize image using OpenCV-equivalent INTER_CUBIC interpolation. - Implemented in pure PyTorch - """ - - if img.ndim == 3: - img = img.unsqueeze(0) - - img = img.permute(0, 3, 1, 2) - - out_h, out_w = size - img = resize_cubic_1d(img, out_h, dim=2) - img = resize_cubic_1d(img, out_w, dim=3) - return img - -def resize_area(img: torch.Tensor, size: tuple) -> torch.Tensor: - # vectorized implementation for OpenCV's INTER_AREA using pure PyTorch - original_shape = img.shape - is_hwc = False - - if img.ndim == 3: - if img.shape[0] <= 4: - img = img.unsqueeze(0) - else: - is_hwc = True - img = img.permute(2, 0, 1).unsqueeze(0) - elif img.ndim == 4: - pass - else: - raise ValueError("Expected image with 3 or 4 dims.") - - B, C, H, W = img.shape - out_h, out_w = size - scale_y = H / out_h - scale_x = W / out_w - - device = img.device - - # compute the grid boundries - y_start = torch.arange(out_h, device=device).float() * scale_y - y_end = y_start + scale_y - x_start = torch.arange(out_w, device=device).float() * scale_x - x_end = x_start + scale_x - - # for each output pixel, we will compute the range for it - y_start_int = torch.floor(y_start).long() - y_end_int = torch.ceil(y_end).long() - x_start_int = torch.floor(x_start).long() - x_end_int = torch.ceil(x_end).long() - - # We will build the weighted sums by iterating over contributing input pixels once - output = torch.zeros((B, C, out_h, out_w), dtype=torch.float32, device=device) - area = torch.zeros((out_h, out_w), dtype=torch.float32, device=device) - - max_kernel_h = int(torch.max(y_end_int - y_start_int).item()) - max_kernel_w = int(torch.max(x_end_int - x_start_int).item()) - - for dy in range(max_kernel_h): - for dx in range(max_kernel_w): - # compute the weights for this offset for all output pixels - - y_idx = y_start_int.unsqueeze(1) + dy - x_idx = x_start_int.unsqueeze(0) + dx - - # clamp indices to image boundaries - y_idx_clamped = torch.clamp(y_idx, 0, H - 1) - x_idx_clamped = torch.clamp(x_idx, 0, W - 1) - - # compute weights by broadcasting - y_weight = (torch.min(y_end.unsqueeze(1), y_idx_clamped.float() + 1.0) - torch.max(y_start.unsqueeze(1), y_idx_clamped.float())).clamp(min=0) - x_weight = (torch.min(x_end.unsqueeze(0), x_idx_clamped.float() + 1.0) - torch.max(x_start.unsqueeze(0), x_idx_clamped.float())).clamp(min=0) - - weight = (y_weight * x_weight) - - y_expand = y_idx_clamped.expand(out_h, out_w) - x_expand = x_idx_clamped.expand(out_h, out_w) - - - pixels = img[:, :, y_expand, x_expand] - - # unsqueeze to broadcast - w = weight.unsqueeze(0).unsqueeze(0) - - output += pixels * w - area += weight - - # Normalize by area - output /= area.unsqueeze(0).unsqueeze(0) - - if is_hwc: - return output[0].permute(1, 2, 0) - elif img.shape[0] == 1 and original_shape[0] <= 4: - return output[0] - else: - return output - -def recenter(image, border_ratio: float = 0.2): - - if image.shape[-1] == 4: - mask = image[..., 3] - else: - mask = torch.ones_like(image[..., 0:1]) * 255 - image = torch.concatenate([image, mask], axis=-1) - mask = mask[..., 0] - - H, W, C = image.shape - - size = max(H, W) - result = torch.zeros((size, size, C), dtype = torch.uint8) - - # as_tuple to match numpy behaviour - x_coords, y_coords = torch.nonzero(mask, as_tuple=True) - - y_min, y_max = y_coords.min(), y_coords.max() - x_min, x_max = x_coords.min(), x_coords.max() - - h = x_max - x_min - w = y_max - y_min - - if h == 0 or w == 0: - raise ValueError('input image is empty') - - desired_size = int(size * (1 - border_ratio)) - scale = desired_size / max(h, w) - - h2 = int(h * scale) - w2 = int(w * scale) - - x2_min = (size - h2) // 2 - x2_max = x2_min + h2 - - y2_min = (size - w2) // 2 - y2_max = y2_min + w2 - - # note: opencv takes columns first (opposite to pytorch and numpy that take the row first) - result[x2_min:x2_max, y2_min:y2_max] = resize_area(image[x_min:x_max, y_min:y_max], (h2, w2)) - - bg = torch.ones((result.shape[0], result.shape[1], 3), dtype = torch.uint8) * 255 - - mask = result[..., 3:].to(torch.float32) / 255 - result = result[..., :3] * mask + bg * (1 - mask) - - mask = mask * 255 - result = result.clip(0, 255).to(torch.uint8) - mask = mask.clip(0, 255).to(torch.uint8) - - return result - -def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], - crop=True, value_range = (-1, 1), border_ratio: float = None, recenter_size: int = 512): - - if border_ratio is not None: - - image = (image * 255).clamp(0, 255).to(torch.uint8) - image = [recenter(img, border_ratio = border_ratio) for img in image] - - image = torch.stack(image, dim = 0) - image = resize_cubic(image, size = (recenter_size, recenter_size)) - - image = image / 255 * 2 - 1 - low, high = value_range - - image = (image - low) / (high - low) - image = image.permute(0, 2, 3, 1) - +def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True): image = image[:, :, :, :3] if image.shape[3] > 3 else image - mean = torch.tensor(mean, device=image.device, dtype=image.dtype) std = torch.tensor(std, device=image.device, dtype=image.dtype) - image = image.movedim(-1, 1) if not (image.shape[2] == size and image.shape[3] == size): if crop: @@ -246,7 +29,7 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s else: scale_size = (size, size) - image = torch.nn.functional.interpolate(image, size=scale_size, mode="bilinear" if border_ratio is not None else "bicubic", antialias=True) + image = torch.nn.functional.interpolate(image, size=scale_size, mode="bicubic", antialias=True) h = (image.shape[2] - size)//2 w = (image.shape[3] - size)//2 image = image[:,:,h:h+size,w:w+size] @@ -288,9 +71,9 @@ class ClipVisionModel(): def get_sd(self): return self.model.state_dict() - def encode_image(self, image, crop=True, border_ratio: float = None): + def encode_image(self, image, crop=True): comfy.model_management.load_model_gpu(self.patcher) - pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop, border_ratio=border_ratio).float() + pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float() out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2) outputs = Output() diff --git a/comfy/sd.py b/comfy/sd.py index 014f797ca..be5aa8dc8 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -1058,27 +1058,6 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c model = None model_patcher = None - if isinstance(sd, dict) and all(k in sd for k in ["model", "vae", "conditioner"]): - from collections import OrderedDict - import gc - - merged_sd = OrderedDict() - - for k, v in sd["model"].items(): - merged_sd[f"model.{k}"] = v - - for k, v in sd["vae"].items(): - merged_sd[f"vae.{k}"] = v - - for key, value in sd["conditioner"].items(): - merged_sd[f"conditioner.{key}"] = value - - sd = merged_sd - - del merged_sd - gc.collect() - torch.cuda.empty_cache() - diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd) parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix) weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix) diff --git a/nodes.py b/nodes.py index 1afe5601a..6c2f9dd14 100644 --- a/nodes.py +++ b/nodes.py @@ -998,31 +998,20 @@ class CLIPVisionLoader: class CLIPVisionEncode: @classmethod def INPUT_TYPES(s): - return { - "required": { - "clip_vision": ("CLIP_VISION",), - "image": ("IMAGE",), - "crop": (["center", "none", "recenter"],), - }, - "optional": { - "border_ratio": ("FLOAT", {"default": 0.15, "min": 0.0, "max": 0.5, "step": 0.01, "visible_if": {"crop": "recenter"},}), - } - } - + return {"required": { "clip_vision": ("CLIP_VISION",), + "image": ("IMAGE",), + "crop": (["center", "none"],) + }} RETURN_TYPES = ("CLIP_VISION_OUTPUT",) FUNCTION = "encode" CATEGORY = "conditioning" - def encode(self, clip_vision, image, crop, border_ratio): - crop_image = crop == "center" - - if crop == "recenter": - crop_image = True - else: - border_ratio = None - - output = clip_vision.encode_image(image, crop=crop_image, border_ratio = border_ratio) + def encode(self, clip_vision, image, crop): + crop_image = True + if crop != "center": + crop_image = False + output = clip_vision.encode_image(image, crop=crop_image) return (output,) class StyleModelLoader: diff --git a/requirements.txt b/requirements.txt index 564fa6e23..3008a5dc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,4 +27,4 @@ kornia>=0.7.1 spandrel soundfile pydantic~=2.0 -pydantic-settings~=2.0 \ No newline at end of file +pydantic-settings~=2.0 From 3493b9cb1f9a9a66b1b86ed908cf87bc382b647a Mon Sep 17 00:00:00 2001 From: Arjan Singh <1598641+arjansingh@users.noreply.github.com> Date: Fri, 5 Sep 2025 11:32:25 -0700 Subject: [PATCH 242/759] fix: add cache headers for images (#9560) --- middleware/__init__.py | 1 + middleware/cache_middleware.py | 52 ++++ server.py | 11 +- tests-unit/server_test/test_cache_control.py | 255 +++++++++++++++++++ 4 files changed, 311 insertions(+), 8 deletions(-) create mode 100644 middleware/__init__.py create mode 100644 middleware/cache_middleware.py create mode 100644 tests-unit/server_test/test_cache_control.py diff --git a/middleware/__init__.py b/middleware/__init__.py new file mode 100644 index 000000000..2d7c7c3a9 --- /dev/null +++ b/middleware/__init__.py @@ -0,0 +1 @@ +"""Server middleware modules""" diff --git a/middleware/cache_middleware.py b/middleware/cache_middleware.py new file mode 100644 index 000000000..374ef7934 --- /dev/null +++ b/middleware/cache_middleware.py @@ -0,0 +1,52 @@ +"""Cache control middleware for ComfyUI server""" + +from aiohttp import web +from typing import Callable, Awaitable + +# Time in seconds +ONE_HOUR: int = 3600 +ONE_DAY: int = 86400 +IMG_EXTENSIONS = ( + ".jpg", + ".jpeg", + ".png", + ".ppm", + ".bmp", + ".pgm", + ".tif", + ".tiff", + ".webp", +) + + +@web.middleware +async def cache_control( + request: web.Request, handler: Callable[[web.Request], Awaitable[web.Response]] +) -> web.Response: + """Cache control middleware that sets appropriate cache headers based on file type and response status""" + response: web.Response = await handler(request) + + if ( + request.path.endswith(".js") + or request.path.endswith(".css") + or request.path.endswith("index.json") + ): + response.headers.setdefault("Cache-Control", "no-cache") + return response + + # Early return for non-image files - no cache headers needed + if not request.path.lower().endswith(IMG_EXTENSIONS): + return response + + # Handle image files + if response.status == 404: + response.headers.setdefault("Cache-Control", f"public, max-age={ONE_HOUR}") + elif response.status in (200, 201, 202, 203, 204, 205, 206, 301, 308): + # Success responses and permanent redirects - cache for 1 day + response.headers.setdefault("Cache-Control", f"public, max-age={ONE_DAY}") + elif response.status in (302, 303, 307): + # Temporary redirects - no cache + response.headers.setdefault("Cache-Control", "no-cache") + # Note: 304 Not Modified falls through - no cache headers set + + return response diff --git a/server.py b/server.py index 3d323eaf8..43816a8cd 100644 --- a/server.py +++ b/server.py @@ -39,20 +39,15 @@ from typing import Optional, Union from api_server.routes.internal.internal_routes import InternalRoutes from protocol import BinaryEventTypes +# Import cache control middleware +from middleware.cache_middleware import cache_control + async def send_socket_catch_exception(function, message): try: await function(message) except (aiohttp.ClientError, aiohttp.ClientPayloadError, ConnectionResetError, BrokenPipeError, ConnectionError) as err: logging.warning("send error: {}".format(err)) -@web.middleware -async def cache_control(request: web.Request, handler): - response: web.Response = await handler(request) - if request.path.endswith('.js') or request.path.endswith('.css') or request.path.endswith('index.json'): - response.headers.setdefault('Cache-Control', 'no-cache') - return response - - @web.middleware async def compress_body(request: web.Request, handler): accept_encoding = request.headers.get("Accept-Encoding", "") diff --git a/tests-unit/server_test/test_cache_control.py b/tests-unit/server_test/test_cache_control.py new file mode 100644 index 000000000..8de59125a --- /dev/null +++ b/tests-unit/server_test/test_cache_control.py @@ -0,0 +1,255 @@ +"""Tests for server cache control middleware""" + +import pytest +from aiohttp import web +from aiohttp.test_utils import make_mocked_request +from typing import Dict, Any + +from middleware.cache_middleware import cache_control, ONE_HOUR, ONE_DAY, IMG_EXTENSIONS + +pytestmark = pytest.mark.asyncio # Apply asyncio mark to all tests + +# Test configuration data +CACHE_SCENARIOS = [ + # Image file scenarios + { + "name": "image_200_status", + "path": "/test.jpg", + "status": 200, + "expected_cache": f"public, max-age={ONE_DAY}", + "should_have_header": True, + }, + { + "name": "image_404_status", + "path": "/missing.jpg", + "status": 404, + "expected_cache": f"public, max-age={ONE_HOUR}", + "should_have_header": True, + }, + # JavaScript/CSS scenarios + { + "name": "js_no_cache", + "path": "/script.js", + "status": 200, + "expected_cache": "no-cache", + "should_have_header": True, + }, + { + "name": "css_no_cache", + "path": "/styles.css", + "status": 200, + "expected_cache": "no-cache", + "should_have_header": True, + }, + { + "name": "index_json_no_cache", + "path": "/api/index.json", + "status": 200, + "expected_cache": "no-cache", + "should_have_header": True, + }, + # Non-matching files + { + "name": "html_no_header", + "path": "/index.html", + "status": 200, + "expected_cache": None, + "should_have_header": False, + }, + { + "name": "txt_no_header", + "path": "/data.txt", + "status": 200, + "expected_cache": None, + "should_have_header": False, + }, + { + "name": "api_endpoint_no_header", + "path": "/api/endpoint", + "status": 200, + "expected_cache": None, + "should_have_header": False, + }, + { + "name": "pdf_no_header", + "path": "/file.pdf", + "status": 200, + "expected_cache": None, + "should_have_header": False, + }, +] + +# Status code scenarios for images +IMAGE_STATUS_SCENARIOS = [ + # Success statuses get long cache + {"status": 200, "expected": f"public, max-age={ONE_DAY}"}, + {"status": 201, "expected": f"public, max-age={ONE_DAY}"}, + {"status": 202, "expected": f"public, max-age={ONE_DAY}"}, + {"status": 204, "expected": f"public, max-age={ONE_DAY}"}, + {"status": 206, "expected": f"public, max-age={ONE_DAY}"}, + # Permanent redirects get long cache + {"status": 301, "expected": f"public, max-age={ONE_DAY}"}, + {"status": 308, "expected": f"public, max-age={ONE_DAY}"}, + # Temporary redirects get no cache + {"status": 302, "expected": "no-cache"}, + {"status": 303, "expected": "no-cache"}, + {"status": 307, "expected": "no-cache"}, + # 404 gets short cache + {"status": 404, "expected": f"public, max-age={ONE_HOUR}"}, +] + +# Case sensitivity test paths +CASE_SENSITIVITY_PATHS = ["/image.JPG", "/photo.PNG", "/pic.JpEg"] + +# Edge case test paths +EDGE_CASE_PATHS = [ + { + "name": "query_strings_ignored", + "path": "/image.jpg?v=123&size=large", + "expected": f"public, max-age={ONE_DAY}", + }, + { + "name": "multiple_dots_in_path", + "path": "/image.min.jpg", + "expected": f"public, max-age={ONE_DAY}", + }, + { + "name": "nested_paths_with_images", + "path": "/static/images/photo.jpg", + "expected": f"public, max-age={ONE_DAY}", + }, +] + + +class TestCacheControl: + """Test cache control middleware functionality""" + + @pytest.fixture + def status_handler_factory(self): + """Create a factory for handlers that return specific status codes""" + + def factory(status: int, headers: Dict[str, str] = None): + async def handler(request): + return web.Response(status=status, headers=headers or {}) + + return handler + + return factory + + @pytest.fixture + def mock_handler(self, status_handler_factory): + """Create a mock handler that returns a response with 200 status""" + return status_handler_factory(200) + + @pytest.fixture + def handler_with_existing_cache(self, status_handler_factory): + """Create a handler that returns response with existing Cache-Control header""" + return status_handler_factory(200, {"Cache-Control": "max-age=3600"}) + + async def assert_cache_header( + self, + response: web.Response, + expected_cache: str = None, + should_have_header: bool = True, + ): + """Helper to assert cache control headers""" + if should_have_header: + assert "Cache-Control" in response.headers + if expected_cache: + assert response.headers["Cache-Control"] == expected_cache + else: + assert "Cache-Control" not in response.headers + + # Parameterized tests + @pytest.mark.parametrize("scenario", CACHE_SCENARIOS, ids=lambda x: x["name"]) + async def test_cache_control_scenarios( + self, scenario: Dict[str, Any], status_handler_factory + ): + """Test various cache control scenarios""" + handler = status_handler_factory(scenario["status"]) + request = make_mocked_request("GET", scenario["path"]) + response = await cache_control(request, handler) + + assert response.status == scenario["status"] + await self.assert_cache_header( + response, scenario["expected_cache"], scenario["should_have_header"] + ) + + @pytest.mark.parametrize("ext", IMG_EXTENSIONS) + async def test_all_image_extensions(self, ext: str, mock_handler): + """Test all defined image extensions are handled correctly""" + request = make_mocked_request("GET", f"/image{ext}") + response = await cache_control(request, mock_handler) + + assert response.status == 200 + assert "Cache-Control" in response.headers + assert response.headers["Cache-Control"] == f"public, max-age={ONE_DAY}" + + @pytest.mark.parametrize( + "status_scenario", IMAGE_STATUS_SCENARIOS, ids=lambda x: f"status_{x['status']}" + ) + async def test_image_status_codes( + self, status_scenario: Dict[str, Any], status_handler_factory + ): + """Test different status codes for image requests""" + handler = status_handler_factory(status_scenario["status"]) + request = make_mocked_request("GET", "/image.jpg") + response = await cache_control(request, handler) + + assert response.status == status_scenario["status"] + assert "Cache-Control" in response.headers + assert response.headers["Cache-Control"] == status_scenario["expected"] + + @pytest.mark.parametrize("path", CASE_SENSITIVITY_PATHS) + async def test_case_insensitive_image_extension(self, path: str, mock_handler): + """Test that image extensions are matched case-insensitively""" + request = make_mocked_request("GET", path) + response = await cache_control(request, mock_handler) + + assert "Cache-Control" in response.headers + assert response.headers["Cache-Control"] == f"public, max-age={ONE_DAY}" + + @pytest.mark.parametrize("edge_case", EDGE_CASE_PATHS, ids=lambda x: x["name"]) + async def test_edge_cases(self, edge_case: Dict[str, str], mock_handler): + """Test edge cases like query strings, nested paths, etc.""" + request = make_mocked_request("GET", edge_case["path"]) + response = await cache_control(request, mock_handler) + + assert "Cache-Control" in response.headers + assert response.headers["Cache-Control"] == edge_case["expected"] + + # Header preservation tests (special cases not covered by parameterization) + async def test_js_preserves_existing_headers(self, handler_with_existing_cache): + """Test that .js files preserve existing Cache-Control headers""" + request = make_mocked_request("GET", "/script.js") + response = await cache_control(request, handler_with_existing_cache) + + # setdefault should preserve existing header + assert response.headers["Cache-Control"] == "max-age=3600" + + async def test_css_preserves_existing_headers(self, handler_with_existing_cache): + """Test that .css files preserve existing Cache-Control headers""" + request = make_mocked_request("GET", "/styles.css") + response = await cache_control(request, handler_with_existing_cache) + + # setdefault should preserve existing header + assert response.headers["Cache-Control"] == "max-age=3600" + + async def test_image_preserves_existing_headers(self, status_handler_factory): + """Test that image cache headers preserve existing Cache-Control""" + handler = status_handler_factory(200, {"Cache-Control": "private, no-cache"}) + request = make_mocked_request("GET", "/image.jpg") + response = await cache_control(request, handler) + + # setdefault should preserve existing header + assert response.headers["Cache-Control"] == "private, no-cache" + + async def test_304_not_modified_inherits_cache(self, status_handler_factory): + """Test that 304 Not Modified doesn't set cache headers for images""" + handler = status_handler_factory(304, {"Cache-Control": "max-age=7200"}) + request = make_mocked_request("GET", "/not-modified.jpg") + response = await cache_control(request, handler) + + assert response.status == 304 + # Should preserve existing cache header, not override + assert response.headers["Cache-Control"] == "max-age=7200" From 2ee7879a0bdbf507bfd26f8b36eca2fef147c29d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 5 Sep 2025 11:57:35 -0700 Subject: [PATCH 243/759] Fix lowvram issues with hunyuan3d 2.1 (#9735) --- comfy/ldm/hunyuan3dv2_1/hunyuandit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py index 48575bb3c..ca1a83001 100644 --- a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py +++ b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py @@ -3,6 +3,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from comfy.ldm.modules.attention import optimized_attention +import comfy.model_management class GELU(nn.Module): @@ -88,7 +89,7 @@ class MoEGate(nn.Module): hidden_states = hidden_states.view(-1, hidden_states.size(-1)) # get logits and pass it to softmax - logits = F.linear(hidden_states, self.weight, bias = None) + logits = F.linear(hidden_states, comfy.model_management.cast_to(self.weight, dtype=hidden_states.dtype, device=hidden_states.device), bias = None) scores = logits.softmax(dim = -1) topk_weight, topk_idx = torch.topk(scores, k = self.top_k, dim = -1, sorted = False) @@ -255,7 +256,7 @@ class TimestepEmbedder(nn.Module): cond_embed = self.cond_proj(condition) timestep_embed = timestep_embed + cond_embed - time_conditioned = self.mlp(timestep_embed.to(self.mlp[0].weight.device)) + time_conditioned = self.mlp(timestep_embed) # for broadcasting with image tokens return time_conditioned.unsqueeze(1) From ea6cdd2631fbca6ed81b95796150c32c9a029f0d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 5 Sep 2025 22:05:05 -0700 Subject: [PATCH 244/759] Print all fast options in --help (#9737) --- comfy/cli_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 72eeaea9a..cc1f12482 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -145,7 +145,7 @@ class PerformanceFeature(enum.Enum): CublasOps = "cublas_ops" AutoTune = "autotune" -parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops") +parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature)))) parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.") parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.") From 27a0fcccc376fef6f035ed97664db8aa7e2e6117 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 6 Sep 2025 20:25:22 -0700 Subject: [PATCH 245/759] Enable bf16 VAE on RDNA4. (#9746) --- comfy/model_management.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index d08aee1fe..17516b6ed 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -289,6 +289,21 @@ def is_amd(): return True return False +def amd_min_version(device=None, min_rdna_version=0): + if not is_amd(): + return False + + arch = torch.cuda.get_device_properties(device).gcnArchName + if arch.startswith('gfx') and len(arch) == 7: + try: + cmp_rdna_version = int(arch[4]) + 2 + except: + cmp_rdna_version = 0 + if cmp_rdna_version >= min_rdna_version: + return True + + return False + MIN_WEIGHT_MEMORY_RATIO = 0.4 if is_nvidia(): MIN_WEIGHT_MEMORY_RATIO = 0.0 @@ -905,7 +920,9 @@ def vae_dtype(device=None, allowed_dtypes=[]): # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32 # slowness still a problem on pytorch nightly 2.9.0.dev20250720+rocm6.4 tested on RDNA3 - if d == torch.bfloat16 and (not is_amd()) and should_use_bf16(device): + # also a problem on RDNA4 except fp32 is also slow there. + # This is due to large bf16 convolutions being extremely slow. + if d == torch.bfloat16 and ((not is_amd()) or amd_min_version(device, min_rdna_version=4)) and should_use_bf16(device): return d return torch.float32 From bcbd7884e3af5ee8b6ab848da2a3123f247d6114 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 6 Sep 2025 21:29:38 -0700 Subject: [PATCH 246/759] Don't enable pytorch attention on AMD if triton isn't available. (#9747) --- comfy/model_management.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 17516b6ed..cb6580f73 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -22,6 +22,7 @@ from enum import Enum from comfy.cli_args import args, PerformanceFeature import torch import sys +import importlib import platform import weakref import gc @@ -336,12 +337,13 @@ try: logging.info("AMD arch: {}".format(arch)) logging.info("ROCm version: {}".format(rocm_version)) if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: - if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much - if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950 - ENABLE_PYTORCH_ATTENTION = True -# if torch_version_numeric >= (2, 8): -# if any((a in arch) for a in ["gfx1201"]): -# ENABLE_PYTORCH_ATTENTION = True + if importlib.util.find_spec('triton') is not None: # AMD efficient attention implementation depends on triton. TODO: better way of detecting if it's compiled in or not. + if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much + if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950 + ENABLE_PYTORCH_ATTENTION = True +# if torch_version_numeric >= (2, 8): +# if any((a in arch) for a in ["gfx1201"]): +# ENABLE_PYTORCH_ATTENTION = True if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4): if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]): # TODO: more arches SUPPORT_FP8_OPS = True From fb763d43332aaf15e96350cf1c25e2a1927423f1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 7 Sep 2025 18:16:29 -0700 Subject: [PATCH 247/759] Fix amd_min_version crash when cpu device. (#9754) --- comfy/model_management.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy/model_management.py b/comfy/model_management.py index cb6580f73..bbfc3c7a1 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -294,6 +294,9 @@ def amd_min_version(device=None, min_rdna_version=0): if not is_amd(): return False + if is_device_cpu(device): + return False + arch = torch.cuda.get_device_properties(device).gcnArchName if arch.startswith('gfx') and len(arch) == 7: try: From bd1d9bcd5fcdb8379ce5a8020cb2b8f42de1b7c7 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Mon, 8 Sep 2025 12:07:04 -0700 Subject: [PATCH 248/759] Add ZeroDivisionError catch for EasyCache logging statement (#9768) --- comfy_extras/nodes_easycache.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py index 9d2988f5f..c170e9fd9 100644 --- a/comfy_extras/nodes_easycache.py +++ b/comfy_extras/nodes_easycache.py @@ -162,7 +162,12 @@ def easycache_sample_wrapper(executor, *args, **kwargs): logging.info(f"{easycache.name} [verbose] - output_change_rates {len(output_change_rates)}: {output_change_rates}") logging.info(f"{easycache.name} [verbose] - approx_output_change_rates {len(approx_output_change_rates)}: {approx_output_change_rates}") total_steps = len(args[3])-1 - logging.info(f"{easycache.name} - skipped {easycache.total_steps_skipped}/{total_steps} steps ({total_steps/(total_steps-easycache.total_steps_skipped):.2f}x speedup).") + # catch division by zero for log statement; sucks to crash after all sampling is done + try: + speedup = total_steps/(total_steps-easycache.total_steps_skipped) + except ZeroDivisionError: + speedup = 1.0 + logging.info(f"{easycache.name} - skipped {easycache.total_steps_skipped}/{total_steps} steps ({speedup:.2f}x speedup).") easycache.reset() guider.model_options = orig_model_options From 97652d26b81f83fc9a3675be55ede7762fafb7bd Mon Sep 17 00:00:00 2001 From: contentis Date: Mon, 8 Sep 2025 21:08:18 +0200 Subject: [PATCH 249/759] Add explicit casting in apply_rope for Qwen VL (#9759) --- comfy/text_encoders/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 4c976058f..5e11956b5 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -128,11 +128,12 @@ def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=N def apply_rope(xq, xk, freqs_cis): + org_dtype = xq.dtype cos = freqs_cis[0] sin = freqs_cis[1] q_embed = (xq * cos) + (rotate_half(xq) * sin) k_embed = (xk * cos) + (rotate_half(xk) * sin) - return q_embed, k_embed + return q_embed.to(org_dtype), k_embed.to(org_dtype) class Attention(nn.Module): From 103a12cb668303f197b22f52bb2981bb1539beea Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 8 Sep 2025 14:30:26 -0700 Subject: [PATCH 250/759] Support qwen inpaint controlnet. (#9772) --- comfy/controlnet.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/comfy/controlnet.py b/comfy/controlnet.py index e3dfedf55..f08ff4b36 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -253,7 +253,10 @@ class ControlNet(ControlBase): to_concat = [] for c in self.extra_concat_orig: c = c.to(self.cond_hint.device) - c = comfy.utils.common_upscale(c, self.cond_hint.shape[3], self.cond_hint.shape[2], self.upscale_algorithm, "center") + c = comfy.utils.common_upscale(c, self.cond_hint.shape[-1], self.cond_hint.shape[-2], self.upscale_algorithm, "center") + if c.ndim < self.cond_hint.ndim: + c = c.unsqueeze(2) + c = comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[2], dim=2) to_concat.append(comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[0])) self.cond_hint = torch.cat([self.cond_hint] + to_concat, dim=1) @@ -585,11 +588,18 @@ def load_controlnet_flux_instantx(sd, model_options={}): def load_controlnet_qwen_instantx(sd, model_options={}): model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options) - control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config) + control_latent_channels = sd.get("controlnet_x_embedder.weight").shape[1] + + extra_condition_channels = 0 + concat_mask = False + if control_latent_channels == 68: #inpaint controlnet + extra_condition_channels = control_latent_channels - 64 + concat_mask = True + control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(extra_condition_channels=extra_condition_channels, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config) control_model = controlnet_load_state_dict(control_model, sd) latent_format = comfy.latent_formats.Wan21() extra_conds = [] - control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds) + control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds) return control def convert_mistoline(sd): From f73b176abd6b3e3b587b668fa6748107deef311c Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 9 Sep 2025 21:40:29 +0300 Subject: [PATCH 251/759] add ByteDance video API nodes (#9712) --- comfy_api_nodes/nodes_bytedance.py | 697 ++++++++++++++++++++++++++++- 1 file changed, 686 insertions(+), 11 deletions(-) diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index fb6aba7fa..064df2d10 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -1,6 +1,7 @@ import logging +import math from enum import Enum -from typing import Optional +from typing import Literal, Optional, Type, Union from typing_extensions import override import torch @@ -10,28 +11,53 @@ from comfy_api.latest import ComfyExtension, io as comfy_io from comfy_api_nodes.util.validation_utils import ( validate_image_aspect_ratio_range, get_number_of_images, + validate_image_dimensions, ) from comfy_api_nodes.apis.client import ( ApiEndpoint, + EmptyRequest, HttpMethod, SynchronousOperation, + PollingOperation, + T, +) +from comfy_api_nodes.apinode_utils import ( + download_url_to_image_tensor, + download_url_to_video_output, + upload_images_to_comfyapi, + validate_string, + image_tensor_pair_to_batch, ) -from comfy_api_nodes.apinode_utils import download_url_to_image_tensor, upload_images_to_comfyapi, validate_string -BYTEPLUS_ENDPOINT = "/proxy/byteplus/api/v3/images/generations" +BYTEPLUS_IMAGE_ENDPOINT = "/proxy/byteplus/api/v3/images/generations" + +# Long-running tasks endpoints(e.g., video) +BYTEPLUS_TASK_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks" +BYTEPLUS_TASK_STATUS_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks" # + /{task_id} class Text2ImageModelName(str, Enum): - seedream3 = "seedream-3-0-t2i-250415" + seedream_3 = "seedream-3-0-t2i-250415" class Image2ImageModelName(str, Enum): - seededit3 = "seededit-3-0-i2i-250628" + seededit_3 = "seededit-3-0-i2i-250628" + + +class Text2VideoModelName(str, Enum): + seedance_1_pro = "seedance-1-0-pro-250528" + seedance_1_lite = "seedance-1-0-lite-t2v-250428" + + +class Image2VideoModelName(str, Enum): + """note(August 31): Pro model only supports FirstFrame: https://docs.byteplus.com/en/docs/ModelArk/1520757""" + seedance_1_pro = "seedance-1-0-pro-250528" + seedance_1_lite = "seedance-1-0-lite-i2v-250428" class Text2ImageTaskCreationRequest(BaseModel): - model: Text2ImageModelName = Text2ImageModelName.seedream3 + model: Text2ImageModelName = Text2ImageModelName.seedream_3 prompt: str = Field(...) response_format: Optional[str] = Field("url") size: Optional[str] = Field(None) @@ -41,7 +67,7 @@ class Text2ImageTaskCreationRequest(BaseModel): class Image2ImageTaskCreationRequest(BaseModel): - model: Image2ImageModelName = Image2ImageModelName.seededit3 + model: Image2ImageModelName = Image2ImageModelName.seededit_3 prompt: str = Field(...) response_format: Optional[str] = Field("url") image: str = Field(..., description="Base64 encoded string or image URL") @@ -58,6 +84,52 @@ class ImageTaskCreationResponse(BaseModel): error: dict = Field({}, description="Contains `code` and `message` fields in case of error.") +class TaskTextContent(BaseModel): + type: str = Field("text") + text: str = Field(...) + + +class TaskImageContentUrl(BaseModel): + url: str = Field(...) + + +class TaskImageContent(BaseModel): + type: str = Field("image_url") + image_url: TaskImageContentUrl = Field(...) + role: Optional[Literal["first_frame", "last_frame", "reference_image"]] = Field(None) + + +class Text2VideoTaskCreationRequest(BaseModel): + model: Text2VideoModelName = Text2VideoModelName.seedance_1_pro + content: list[TaskTextContent] = Field(..., min_length=1) + + +class Image2VideoTaskCreationRequest(BaseModel): + model: Image2VideoModelName = Image2VideoModelName.seedance_1_pro + content: list[Union[TaskTextContent, TaskImageContent]] = Field(..., min_length=2) + + +class TaskCreationResponse(BaseModel): + id: str = Field(...) + + +class TaskStatusError(BaseModel): + code: str = Field(...) + message: str = Field(...) + + +class TaskStatusResult(BaseModel): + video_url: str = Field(...) + + +class TaskStatusResponse(BaseModel): + id: str = Field(...) + model: str = Field(...) + status: Literal["queued", "running", "cancelled", "succeeded", "failed"] = Field(...) + error: Optional[TaskStatusError] = Field(None) + content: Optional[TaskStatusResult] = Field(None) + + RECOMMENDED_PRESETS = [ ("1024x1024 (1:1)", 1024, 1024), ("864x1152 (3:4)", 864, 1152), @@ -71,6 +143,25 @@ RECOMMENDED_PRESETS = [ ("Custom", None, None), ] +# The time in this dictionary are given for 10 seconds duration. +VIDEO_TASKS_EXECUTION_TIME = { + "seedance-1-0-lite-t2v-250428": { + "480p": 40, + "720p": 60, + "1080p": 90, + }, + "seedance-1-0-lite-i2v-250428": { + "480p": 40, + "720p": 60, + "1080p": 90, + }, + "seedance-1-0-pro-250528": { + "480p": 70, + "720p": 85, + "1080p": 115, + }, +} + def get_image_url_from_response(response: ImageTaskCreationResponse) -> str: if response.error: @@ -81,6 +172,42 @@ def get_image_url_from_response(response: ImageTaskCreationResponse) -> str: return response.data[0]["url"] +def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]: + """Returns the video URL from the task status response if it exists.""" + if hasattr(response, "content") and response.content: + return response.content.video_url + return None + + +async def poll_until_finished( + auth_kwargs: dict[str, str], + task_id: str, + estimated_duration: Optional[int] = None, + node_id: Optional[str] = None, +) -> TaskStatusResponse: + """Polls the ByteDance API endpoint until the task reaches a terminal state, then returns the response.""" + return await PollingOperation( + poll_endpoint=ApiEndpoint( + path=f"{BYTEPLUS_TASK_STATUS_ENDPOINT}/{task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=TaskStatusResponse, + ), + completed_statuses=[ + "succeeded", + ], + failed_statuses=[ + "cancelled", + "failed", + ], + status_extractor=lambda response: response.status, + auth_kwargs=auth_kwargs, + result_url_extractor=get_video_url_from_task_status, + estimated_duration=estimated_duration, + node_id=node_id, + ).execute() + + class ByteDanceImageNode(comfy_io.ComfyNode): @classmethod @@ -94,7 +221,7 @@ class ByteDanceImageNode(comfy_io.ComfyNode): comfy_io.Combo.Input( "model", options=[model.value for model in Text2ImageModelName], - default=Text2ImageModelName.seedream3.value, + default=Text2ImageModelName.seedream_3.value, tooltip="Model name", ), comfy_io.String.Input( @@ -203,7 +330,7 @@ class ByteDanceImageNode(comfy_io.ComfyNode): } response = await SynchronousOperation( endpoint=ApiEndpoint( - path=BYTEPLUS_ENDPOINT, + path=BYTEPLUS_IMAGE_ENDPOINT, method=HttpMethod.POST, request_model=Text2ImageTaskCreationRequest, response_model=ImageTaskCreationResponse, @@ -227,7 +354,7 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode): comfy_io.Combo.Input( "model", options=[model.value for model in Image2ImageModelName], - default=Image2ImageModelName.seededit3.value, + default=Image2ImageModelName.seededit_3.value, tooltip="Model name", ), comfy_io.Image.Input( @@ -313,7 +440,7 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode): ) response = await SynchronousOperation( endpoint=ApiEndpoint( - path=BYTEPLUS_ENDPOINT, + path=BYTEPLUS_IMAGE_ENDPOINT, method=HttpMethod.POST, request_model=Image2ImageTaskCreationRequest, response_model=ImageTaskCreationResponse, @@ -324,12 +451,560 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode): return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) +class ByteDanceTextToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ByteDanceTextToVideoNode", + display_name="ByteDance Text to Video", + category="api node/video/ByteDance", + description="Generate video using ByteDance models via api based on prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in Text2VideoModelName], + default=Text2VideoModelName.seedance_1_pro.value, + tooltip="Model name", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="The text prompt used to generate the video.", + ), + comfy_io.Combo.Input( + "resolution", + options=["480p", "720p", "1080p"], + tooltip="The resolution of the output video.", + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=["16:9", "4:3", "1:1", "3:4", "9:16", "21:9"], + tooltip="The aspect ratio of the output video.", + ), + comfy_io.Int.Input( + "duration", + default=5, + min=3, + max=12, + step=1, + tooltip="The duration of the output video in seconds.", + display_mode=comfy_io.NumberDisplay.slider, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + comfy_io.Boolean.Input( + "camera_fixed", + default=False, + tooltip="Specifies whether to fix the camera. The platform appends an instruction " + "to fix the camera to your prompt, but does not guarantee the actual effect.", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the video.", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + resolution: str, + aspect_ratio: str, + duration: int, + seed: int, + camera_fixed: bool, + watermark: bool, + ) -> comfy_io.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"]) + + prompt = ( + f"{prompt} " + f"--resolution {resolution} " + f"--ratio {aspect_ratio} " + f"--duration {duration} " + f"--seed {seed} " + f"--camerafixed {str(camera_fixed).lower()} " + f"--watermark {str(watermark).lower()}" + ) + + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + return await process_video_task( + request_model=Text2VideoTaskCreationRequest, + payload=Text2VideoTaskCreationRequest( + model=model, + content=[TaskTextContent(text=prompt)], + ), + auth_kwargs=auth_kwargs, + node_id=cls.hidden.unique_id, + estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))), + ) + + +class ByteDanceImageToVideoNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ByteDanceImageToVideoNode", + display_name="ByteDance Image to Video", + category="api node/video/ByteDance", + description="Generate video using ByteDance models via api based on image and prompt", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[model.value for model in Image2VideoModelName], + default=Image2VideoModelName.seedance_1_pro.value, + tooltip="Model name", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="The text prompt used to generate the video.", + ), + comfy_io.Image.Input( + "image", + tooltip="First frame to be used for the video.", + ), + comfy_io.Combo.Input( + "resolution", + options=["480p", "720p", "1080p"], + tooltip="The resolution of the output video.", + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=["adaptive", "16:9", "4:3", "1:1", "3:4", "9:16", "21:9"], + tooltip="The aspect ratio of the output video.", + ), + comfy_io.Int.Input( + "duration", + default=5, + min=3, + max=12, + step=1, + tooltip="The duration of the output video in seconds.", + display_mode=comfy_io.NumberDisplay.slider, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + comfy_io.Boolean.Input( + "camera_fixed", + default=False, + tooltip="Specifies whether to fix the camera. The platform appends an instruction " + "to fix the camera to your prompt, but does not guarantee the actual effect.", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the video.", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + image: torch.Tensor, + resolution: str, + aspect_ratio: str, + duration: int, + seed: int, + camera_fixed: bool, + watermark: bool, + ) -> comfy_io.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"]) + validate_image_dimensions(image, min_width=300, min_height=300, max_width=6000, max_height=6000) + validate_image_aspect_ratio_range(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 + + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + + image_url = (await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=auth_kwargs))[0] + + prompt = ( + f"{prompt} " + f"--resolution {resolution} " + f"--ratio {aspect_ratio} " + f"--duration {duration} " + f"--seed {seed} " + f"--camerafixed {str(camera_fixed).lower()} " + f"--watermark {str(watermark).lower()}" + ) + + return await process_video_task( + request_model=Image2VideoTaskCreationRequest, + payload=Image2VideoTaskCreationRequest( + model=model, + content=[TaskTextContent(text=prompt), TaskImageContent(image_url=TaskImageContentUrl(url=image_url))], + ), + auth_kwargs=auth_kwargs, + node_id=cls.hidden.unique_id, + estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))), + ) + + +class ByteDanceFirstLastFrameNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ByteDanceFirstLastFrameNode", + display_name="ByteDance First-Last-Frame to Video", + category="api node/video/ByteDance", + description="Generate video using prompt and first and last frames.", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[Image2VideoModelName.seedance_1_lite.value], + default=Image2VideoModelName.seedance_1_lite.value, + tooltip="Model name", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="The text prompt used to generate the video.", + ), + comfy_io.Image.Input( + "first_frame", + tooltip="First frame to be used for the video.", + ), + comfy_io.Image.Input( + "last_frame", + tooltip="Last frame to be used for the video.", + ), + comfy_io.Combo.Input( + "resolution", + options=["480p", "720p", "1080p"], + tooltip="The resolution of the output video.", + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=["adaptive", "16:9", "4:3", "1:1", "3:4", "9:16", "21:9"], + tooltip="The aspect ratio of the output video.", + ), + comfy_io.Int.Input( + "duration", + default=5, + min=3, + max=12, + step=1, + tooltip="The duration of the output video in seconds.", + display_mode=comfy_io.NumberDisplay.slider, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + comfy_io.Boolean.Input( + "camera_fixed", + default=False, + tooltip="Specifies whether to fix the camera. The platform appends an instruction " + "to fix the camera to your prompt, but does not guarantee the actual effect.", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the video.", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + first_frame: torch.Tensor, + last_frame: torch.Tensor, + resolution: str, + aspect_ratio: str, + duration: int, + seed: int, + camera_fixed: bool, + watermark: bool, + ) -> comfy_io.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"]) + for i in (first_frame, last_frame): + validate_image_dimensions(i, min_width=300, min_height=300, max_width=6000, max_height=6000) + validate_image_aspect_ratio_range(i, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 + + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + + download_urls = await upload_images_to_comfyapi( + image_tensor_pair_to_batch(first_frame, last_frame), + max_images=2, + mime_type="image/png", + auth_kwargs=auth_kwargs, + ) + + prompt = ( + f"{prompt} " + f"--resolution {resolution} " + f"--ratio {aspect_ratio} " + f"--duration {duration} " + f"--seed {seed} " + f"--camerafixed {str(camera_fixed).lower()} " + f"--watermark {str(watermark).lower()}" + ) + + return await process_video_task( + request_model=Image2VideoTaskCreationRequest, + payload=Image2VideoTaskCreationRequest( + model=model, + content=[ + TaskTextContent(text=prompt), + TaskImageContent(image_url=TaskImageContentUrl(url=str(download_urls[0])), role="first_frame"), + TaskImageContent(image_url=TaskImageContentUrl(url=str(download_urls[1])), role="last_frame"), + ], + ), + auth_kwargs=auth_kwargs, + node_id=cls.hidden.unique_id, + estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))), + ) + + +class ByteDanceImageReferenceNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ByteDanceImageReferenceNode", + display_name="ByteDance Reference Images to Video", + category="api node/video/ByteDance", + description="Generate video using prompt and reference images.", + inputs=[ + comfy_io.Combo.Input( + "model", + options=[Image2VideoModelName.seedance_1_lite.value], + default=Image2VideoModelName.seedance_1_lite.value, + tooltip="Model name", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="The text prompt used to generate the video.", + ), + comfy_io.Image.Input( + "images", + tooltip="One to four images.", + ), + comfy_io.Combo.Input( + "resolution", + options=["480p", "720p"], + tooltip="The resolution of the output video.", + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=["adaptive", "16:9", "4:3", "1:1", "3:4", "9:16", "21:9"], + tooltip="The aspect ratio of the output video.", + ), + comfy_io.Int.Input( + "duration", + default=5, + min=3, + max=12, + step=1, + tooltip="The duration of the output video in seconds.", + display_mode=comfy_io.NumberDisplay.slider, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the video.", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + images: torch.Tensor, + resolution: str, + aspect_ratio: str, + duration: int, + seed: int, + watermark: bool, + ) -> comfy_io.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "watermark"]) + for image in images: + validate_image_dimensions(image, min_width=300, min_height=300, max_width=6000, max_height=6000) + validate_image_aspect_ratio_range(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 + + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + + image_urls = await upload_images_to_comfyapi( + images, max_images=4, mime_type="image/png", auth_kwargs=auth_kwargs + ) + + prompt = ( + f"{prompt} " + f"--resolution {resolution} " + f"--ratio {aspect_ratio} " + f"--duration {duration} " + f"--seed {seed} " + f"--watermark {str(watermark).lower()}" + ) + x = [ + TaskTextContent(text=prompt), + *[TaskImageContent(image_url=TaskImageContentUrl(url=str(i)), role="reference_image") for i in image_urls] + ] + return await process_video_task( + request_model=Image2VideoTaskCreationRequest, + payload=Image2VideoTaskCreationRequest( + model=model, + content=x, + ), + auth_kwargs=auth_kwargs, + node_id=cls.hidden.unique_id, + estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))), + ) + + +async def process_video_task( + request_model: Type[T], + payload: Union[Text2VideoTaskCreationRequest, Image2VideoTaskCreationRequest], + auth_kwargs: dict, + node_id: str, + estimated_duration: int | None, +) -> comfy_io.NodeOutput: + initial_response = await SynchronousOperation( + endpoint=ApiEndpoint( + path=BYTEPLUS_TASK_ENDPOINT, + method=HttpMethod.POST, + request_model=request_model, + response_model=TaskCreationResponse, + ), + request=payload, + auth_kwargs=auth_kwargs, + ).execute() + response = await poll_until_finished( + auth_kwargs, + initial_response.id, + estimated_duration=estimated_duration, + node_id=node_id, + ) + return comfy_io.NodeOutput(await download_url_to_video_output(get_video_url_from_task_status(response))) + + +def raise_if_text_params(prompt: str, text_params: list[str]) -> None: + for i in text_params: + if f"--{i} " in prompt: + raise ValueError( + f"--{i} is not allowed in the prompt, use the appropriated widget input to change this value." + ) + + class ByteDanceExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: return [ ByteDanceImageNode, ByteDanceImageEditNode, + ByteDanceTextToVideoNode, + ByteDanceImageToVideoNode, + ByteDanceFirstLastFrameNode, + ByteDanceImageReferenceNode, ] async def comfy_entrypoint() -> ByteDanceExtension: From b288fb0db88281532d813d4fb83f715f88b54ffc Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:09:56 -0700 Subject: [PATCH 252/759] Small refactor of some vae code. (#9787) --- comfy/ldm/modules/diffusionmodules/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index 1fd12b35a..8f598a848 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -145,7 +145,7 @@ class Downsample(nn.Module): class ResnetBlock(nn.Module): def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False, - dropout, temb_channels=512, conv_op=ops.Conv2d): + dropout=0.0, temb_channels=512, conv_op=ops.Conv2d): super().__init__() self.in_channels = in_channels out_channels = in_channels if out_channels is None else out_channels @@ -183,7 +183,7 @@ class ResnetBlock(nn.Module): stride=1, padding=0) - def forward(self, x, temb): + def forward(self, x, temb=None): h = x h = self.norm1(h) h = self.swish(h) From 206595f854c67538d5921d36326acbfeb69c5ac2 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Tue, 9 Sep 2025 18:33:36 -0700 Subject: [PATCH 253/759] Change validate_inputs' output typehint to 'bool | str' and update docstrings (#9786) --- comfy_api/latest/_io.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index e0ee943a7..f770109d5 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -1190,13 +1190,18 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal): raise NotImplementedError @classmethod - def validate_inputs(cls, **kwargs) -> bool: - """Optionally, define this function to validate inputs; equivalent to V1's VALIDATE_INPUTS.""" + def validate_inputs(cls, **kwargs) -> bool | str: + """Optionally, define this function to validate inputs; equivalent to V1's VALIDATE_INPUTS. + + If the function returns a string, it will be used as the validation error message for the node. + """ raise NotImplementedError @classmethod def fingerprint_inputs(cls, **kwargs) -> Any: - """Optionally, define this function to fingerprint inputs; equivalent to V1's IS_CHANGED.""" + """Optionally, define this function to fingerprint inputs; equivalent to V1's IS_CHANGED. + + If this function returns the same value as last run, the node will not be executed.""" raise NotImplementedError @classmethod From 5c33872e2f355e51adf212d5b5c83815b7fe77b0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 9 Sep 2025 21:23:47 -0700 Subject: [PATCH 254/759] Fix issue on old torch. (#9791) --- comfy/ldm/hunyuan3dv2_1/hunyuandit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py index ca1a83001..d48d9d642 100644 --- a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py +++ b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py @@ -426,7 +426,7 @@ class HunYuanDiTBlock(nn.Module): text_states_dim=1024, qk_norm=False, norm_layer=nn.LayerNorm, - qk_norm_layer=nn.RMSNorm, + qk_norm_layer=True, qkv_bias=True, skip_connection=True, timested_modulate=False, From 85e34643f874aec2ab9eed6a8499f2aefa81486e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 9 Sep 2025 23:05:07 -0700 Subject: [PATCH 255/759] Support hunyuan image 2.1 regular model. (#9792) --- comfy/latent_formats.py | 5 + comfy/ldm/hunyuan_video/model.py | 102 +- comfy/ldm/hunyuan_video/vae.py | 136 ++ comfy/model_base.py | 24 + comfy/model_detection.py | 28 +- comfy/sd.py | 31 +- comfy/supported_models.py | 27 +- .../byt5_config_small_glyph.json | 22 + .../byt5_tokenizer/added_tokens.json | 127 ++ .../byt5_tokenizer/special_tokens_map.json | 150 +++ .../byt5_tokenizer/tokenizer_config.json | 1163 +++++++++++++++++ comfy/text_encoders/hunyuan_image.py | 100 ++ comfy_extras/nodes_hunyuan.py | 15 + nodes.py | 6 +- 14 files changed, 1906 insertions(+), 30 deletions(-) create mode 100644 comfy/ldm/hunyuan_video/vae.py create mode 100644 comfy/text_encoders/byt5_config_small_glyph.json create mode 100644 comfy/text_encoders/byt5_tokenizer/added_tokens.json create mode 100644 comfy/text_encoders/byt5_tokenizer/special_tokens_map.json create mode 100644 comfy/text_encoders/byt5_tokenizer/tokenizer_config.json create mode 100644 comfy/text_encoders/hunyuan_image.py diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 0d84994b0..859ae8421 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -533,6 +533,11 @@ class Wan22(Wan21): 0.3971, 1.0600, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744 ]).view(1, self.latent_channels, 1, 1, 1) +class HunyuanImage21(LatentFormat): + latent_channels = 64 + latent_dimensions = 2 + scale_factor = 0.75289 + class Hunyuan3Dv2(LatentFormat): latent_channels = 64 latent_dimensions = 1 diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py index da1011596..ca289c5bd 100644 --- a/comfy/ldm/hunyuan_video/model.py +++ b/comfy/ldm/hunyuan_video/model.py @@ -40,6 +40,7 @@ class HunyuanVideoParams: patch_size: list qkv_bias: bool guidance_embed: bool + byt5: bool class SelfAttentionRef(nn.Module): @@ -161,6 +162,30 @@ class TokenRefiner(nn.Module): x = self.individual_token_refiner(x, c, mask) return x + +class ByT5Mapper(nn.Module): + def __init__(self, in_dim, out_dim, hidden_dim, out_dim1, use_res=False, dtype=None, device=None, operations=None): + super().__init__() + self.layernorm = operations.LayerNorm(in_dim, dtype=dtype, device=device) + self.fc1 = operations.Linear(in_dim, hidden_dim, dtype=dtype, device=device) + self.fc2 = operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device) + self.fc3 = operations.Linear(out_dim, out_dim1, dtype=dtype, device=device) + self.use_res = use_res + self.act_fn = nn.GELU() + + def forward(self, x): + if self.use_res: + res = x + x = self.layernorm(x) + x = self.fc1(x) + x = self.act_fn(x) + x = self.fc2(x) + x2 = self.act_fn(x) + x2 = self.fc3(x2) + if self.use_res: + x2 = x2 + res + return x2 + class HunyuanVideo(nn.Module): """ Transformer model for flow matching on sequences. @@ -185,9 +210,13 @@ class HunyuanVideo(nn.Module): self.num_heads = params.num_heads self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) - self.img_in = comfy.ldm.modules.diffusionmodules.mmdit.PatchEmbed(None, self.patch_size, self.in_channels, self.hidden_size, conv3d=True, dtype=dtype, device=device, operations=operations) + self.img_in = comfy.ldm.modules.diffusionmodules.mmdit.PatchEmbed(None, self.patch_size, self.in_channels, self.hidden_size, conv3d=len(self.patch_size) == 3, dtype=dtype, device=device, operations=operations) self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) - self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations) + if params.vec_in_dim is not None: + self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations) + else: + self.vector_in = None + self.guidance_in = ( MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity() ) @@ -215,6 +244,18 @@ class HunyuanVideo(nn.Module): ] ) + if params.byt5: + self.byt5_in = ByT5Mapper( + in_dim=1472, + out_dim=2048, + hidden_dim=2048, + out_dim1=self.hidden_size, + use_res=False, + dtype=dtype, device=device, operations=operations + ) + else: + self.byt5_in = None + if final_layer: self.final_layer = LastLayer(self.hidden_size, self.patch_size[-1], self.out_channels, dtype=dtype, device=device, operations=operations) @@ -226,7 +267,8 @@ class HunyuanVideo(nn.Module): txt_ids: Tensor, txt_mask: Tensor, timesteps: Tensor, - y: Tensor, + y: Tensor = None, + txt_byt5=None, guidance: Tensor = None, guiding_frame_index=None, ref_latent=None, @@ -250,13 +292,17 @@ class HunyuanVideo(nn.Module): if guiding_frame_index is not None: token_replace_vec = self.time_in(timestep_embedding(guiding_frame_index, 256, time_factor=1.0)) - vec_ = self.vector_in(y[:, :self.params.vec_in_dim]) - vec = torch.cat([(vec_ + token_replace_vec).unsqueeze(1), (vec_ + vec).unsqueeze(1)], dim=1) + if self.vector_in is not None: + vec_ = self.vector_in(y[:, :self.params.vec_in_dim]) + vec = torch.cat([(vec_ + token_replace_vec).unsqueeze(1), (vec_ + vec).unsqueeze(1)], dim=1) + else: + vec = torch.cat([(token_replace_vec).unsqueeze(1), (vec).unsqueeze(1)], dim=1) frame_tokens = (initial_shape[-1] // self.patch_size[-1]) * (initial_shape[-2] // self.patch_size[-2]) modulation_dims = [(0, frame_tokens, 0), (frame_tokens, None, 1)] modulation_dims_txt = [(0, None, 1)] else: - vec = vec + self.vector_in(y[:, :self.params.vec_in_dim]) + if self.vector_in is not None: + vec = vec + self.vector_in(y[:, :self.params.vec_in_dim]) modulation_dims = None modulation_dims_txt = None @@ -269,6 +315,12 @@ class HunyuanVideo(nn.Module): txt = self.txt_in(txt, timesteps, txt_mask) + if self.byt5_in is not None and txt_byt5 is not None: + txt_byt5 = self.byt5_in(txt_byt5) + txt_byt5_ids = torch.zeros((txt_ids.shape[0], txt_byt5.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype) + txt = torch.cat((txt, txt_byt5), dim=1) + txt_ids = torch.cat((txt_ids, txt_byt5_ids), dim=1) + ids = torch.cat((img_ids, txt_ids), dim=1) pe = self.pe_embedder(ids) @@ -328,12 +380,16 @@ class HunyuanVideo(nn.Module): img = self.final_layer(img, vec, modulation_dims=modulation_dims) # (N, T, patch_size ** 2 * out_channels) - shape = initial_shape[-3:] + shape = initial_shape[-len(self.patch_size):] for i in range(len(shape)): shape[i] = shape[i] // self.patch_size[i] img = img.reshape([img.shape[0]] + shape + [self.out_channels] + self.patch_size) - img = img.permute(0, 4, 1, 5, 2, 6, 3, 7) - img = img.reshape(initial_shape[0], self.out_channels, initial_shape[2], initial_shape[3], initial_shape[4]) + if img.ndim == 8: + img = img.permute(0, 4, 1, 5, 2, 6, 3, 7) + img = img.reshape(initial_shape[0], self.out_channels, initial_shape[2], initial_shape[3], initial_shape[4]) + else: + img = img.permute(0, 3, 1, 4, 2, 5) + img = img.reshape(initial_shape[0], self.out_channels, initial_shape[2], initial_shape[3]) return img def img_ids(self, x): @@ -348,16 +404,30 @@ class HunyuanVideo(nn.Module): img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1) return repeat(img_ids, "t h w c -> b (t h w) c", b=bs) - def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs): + def img_ids_2d(self, x): + bs, c, h, w = x.shape + patch_size = self.patch_size + h_len = ((h + (patch_size[0] // 2)) // patch_size[0]) + w_len = ((w + (patch_size[1] // 2)) // patch_size[1]) + img_ids = torch.zeros((h_len, w_len, 2), device=x.device, dtype=x.dtype) + img_ids[:, :, 0] = img_ids[:, :, 0] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + return repeat(img_ids, "h w c -> b (h w) c", b=bs) + + def forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs): return comfy.patcher_extension.WrapperExecutor.new_class_executor( self._forward, self, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) - ).execute(x, timestep, context, y, guidance, attention_mask, guiding_frame_index, ref_latent, control, transformer_options, **kwargs) + ).execute(x, timestep, context, y, txt_byt5, guidance, attention_mask, guiding_frame_index, ref_latent, control, transformer_options, **kwargs) - def _forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs): - bs, c, t, h, w = x.shape - img_ids = self.img_ids(x) - txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) - out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, guiding_frame_index, ref_latent, control=control, transformer_options=transformer_options) + def _forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs): + bs = x.shape[0] + if len(self.patch_size) == 3: + img_ids = self.img_ids(x) + txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) + else: + img_ids = self.img_ids_2d(x) + txt_ids = torch.zeros((bs, context.shape[1], 2), device=x.device, dtype=x.dtype) + out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, guidance, guiding_frame_index, ref_latent, control=control, transformer_options=transformer_options) return out diff --git a/comfy/ldm/hunyuan_video/vae.py b/comfy/ldm/hunyuan_video/vae.py new file mode 100644 index 000000000..8d406089b --- /dev/null +++ b/comfy/ldm/hunyuan_video/vae.py @@ -0,0 +1,136 @@ +import torch.nn as nn +import torch.nn.functional as F +from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock +import comfy.ops +ops = comfy.ops.disable_weight_init + + +class PixelShuffle2D(nn.Module): + def __init__(self, in_dim, out_dim, op=ops.Conv2d): + super().__init__() + self.conv = op(in_dim, out_dim >> 2, 3, 1, 1) + self.ratio = (in_dim << 2) // out_dim + + def forward(self, x): + b, c, h, w = x.shape + h2, w2 = h >> 1, w >> 1 + y = self.conv(x).view(b, -1, h2, 2, w2, 2).permute(0, 3, 5, 1, 2, 4).reshape(b, -1, h2, w2) + r = x.view(b, c, h2, 2, w2, 2).permute(0, 3, 5, 1, 2, 4).reshape(b, c << 2, h2, w2) + return y + r.view(b, y.shape[1], self.ratio, h2, w2).mean(2) + + +class PixelUnshuffle2D(nn.Module): + def __init__(self, in_dim, out_dim, op=ops.Conv2d): + super().__init__() + self.conv = op(in_dim, out_dim << 2, 3, 1, 1) + self.scale = (out_dim << 2) // in_dim + + def forward(self, x): + b, c, h, w = x.shape + h2, w2 = h << 1, w << 1 + y = self.conv(x).view(b, 2, 2, -1, h, w).permute(0, 3, 4, 1, 5, 2).reshape(b, -1, h2, w2) + r = x.repeat_interleave(self.scale, 1).view(b, 2, 2, -1, h, w).permute(0, 3, 4, 1, 5, 2).reshape(b, -1, h2, w2) + return y + r + + +class Encoder(nn.Module): + def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks, + ffactor_spatial, downsample_match_channel=True, **_): + super().__init__() + self.z_channels = z_channels + self.block_out_channels = block_out_channels + self.num_res_blocks = num_res_blocks + self.conv_in = ops.Conv2d(in_channels, block_out_channels[0], 3, 1, 1) + + self.down = nn.ModuleList() + ch = block_out_channels[0] + depth = (ffactor_spatial >> 1).bit_length() + + for i, tgt in enumerate(block_out_channels): + stage = nn.Module() + stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, + out_channels=tgt, + temb_channels=0, + conv_op=ops.Conv2d) + for j in range(num_res_blocks)]) + ch = tgt + if i < depth: + nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and downsample_match_channel else ch + stage.downsample = PixelShuffle2D(ch, nxt, ops.Conv2d) + ch = nxt + self.down.append(stage) + + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=ops.Conv2d) + self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv2d) + self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=ops.Conv2d) + + self.norm_out = nn.GroupNorm(32, ch, 1e-6, True) + self.conv_out = ops.Conv2d(ch, z_channels << 1, 3, 1, 1) + + def forward(self, x): + x = self.conv_in(x) + + for stage in self.down: + for blk in stage.block: + x = blk(x) + if hasattr(stage, 'downsample'): + x = stage.downsample(x) + + x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x))) + + b, c, h, w = x.shape + grp = c // (self.z_channels << 1) + skip = x.view(b, c // grp, grp, h, w).mean(2) + + return self.conv_out(F.silu(self.norm_out(x))) + skip + + +class Decoder(nn.Module): + def __init__(self, z_channels, out_channels, block_out_channels, num_res_blocks, + ffactor_spatial, upsample_match_channel=True, **_): + super().__init__() + block_out_channels = block_out_channels[::-1] + self.z_channels = z_channels + self.block_out_channels = block_out_channels + self.num_res_blocks = num_res_blocks + + ch = block_out_channels[0] + self.conv_in = ops.Conv2d(z_channels, ch, 3, 1, 1) + + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=ops.Conv2d) + self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv2d) + self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=ops.Conv2d) + + self.up = nn.ModuleList() + depth = (ffactor_spatial >> 1).bit_length() + + for i, tgt in enumerate(block_out_channels): + stage = nn.Module() + stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, + out_channels=tgt, + temb_channels=0, + conv_op=ops.Conv2d) + for j in range(num_res_blocks + 1)]) + ch = tgt + if i < depth: + nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and upsample_match_channel else ch + stage.upsample = PixelUnshuffle2D(ch, nxt, ops.Conv2d) + ch = nxt + self.up.append(stage) + + self.norm_out = nn.GroupNorm(32, ch, 1e-6, True) + self.conv_out = ops.Conv2d(ch, out_channels, 3, 1, 1) + + def forward(self, z): + x = self.conv_in(z) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1) + x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x))) + + for stage in self.up: + for blk in stage.block: + x = blk(x) + if hasattr(stage, 'upsample'): + x = stage.upsample(x) + + return self.conv_out(F.silu(self.norm_out(x))) diff --git a/comfy/model_base.py b/comfy/model_base.py index 39a3344bc..993ff65e6 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1408,3 +1408,27 @@ class QwenImage(BaseModel): if ref_latents is not None: out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16]) return out + +class HunyuanImage21(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan_video.model.HunyuanVideo) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + if torch.numel(attention_mask) != attention_mask.sum(): + out['attention_mask'] = comfy.conds.CONDRegular(attention_mask) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + + conditioning_byt5small = kwargs.get("conditioning_byt5small", None) + if conditioning_byt5small is not None: + out['txt_byt5'] = comfy.conds.CONDRegular(conditioning_byt5small) + + guidance = kwargs.get("guidance", 6.0) + if guidance is not None: + out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance])) + + return out diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 75552ede9..dbcbe5f5a 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -136,20 +136,32 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): if '{}txt_in.individual_token_refiner.blocks.0.norm1.weight'.format(key_prefix) in state_dict_keys: #Hunyuan Video dit_config = {} + in_w = state_dict['{}img_in.proj.weight'.format(key_prefix)] + out_w = state_dict['{}final_layer.linear.weight'.format(key_prefix)] dit_config["image_model"] = "hunyuan_video" - dit_config["in_channels"] = state_dict['{}img_in.proj.weight'.format(key_prefix)].shape[1] #SkyReels img2video has 32 input channels - dit_config["patch_size"] = [1, 2, 2] - dit_config["out_channels"] = 16 - dit_config["vec_in_dim"] = 768 - dit_config["context_in_dim"] = 4096 - dit_config["hidden_size"] = 3072 + dit_config["in_channels"] = in_w.shape[1] #SkyReels img2video has 32 input channels + dit_config["patch_size"] = list(in_w.shape[2:]) + dit_config["out_channels"] = out_w.shape[0] // math.prod(dit_config["patch_size"]) + if '{}vector_in.in_layer.weight'.format(key_prefix) in state_dict: + dit_config["vec_in_dim"] = 768 + dit_config["axes_dim"] = [16, 56, 56] + else: + dit_config["vec_in_dim"] = None + dit_config["axes_dim"] = [64, 64] + + dit_config["context_in_dim"] = state_dict['{}txt_in.input_embedder.weight'.format(key_prefix)].shape[1] + dit_config["hidden_size"] = in_w.shape[0] dit_config["mlp_ratio"] = 4.0 - dit_config["num_heads"] = 24 + dit_config["num_heads"] = in_w.shape[0] // 128 dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.') dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.') - dit_config["axes_dim"] = [16, 56, 56] dit_config["theta"] = 256 dit_config["qkv_bias"] = True + if '{}byt5_in.fc1.weight'.format(key_prefix) in state_dict: + dit_config["byt5"] = True + else: + dit_config["byt5"] = False + guidance_keys = list(filter(lambda a: a.startswith("{}guidance_in.".format(key_prefix)), state_dict_keys)) dit_config["guidance_embed"] = len(guidance_keys) > 0 return dit_config diff --git a/comfy/sd.py b/comfy/sd.py index be5aa8dc8..9dd9a74d4 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -17,6 +17,7 @@ import comfy.ldm.wan.vae import comfy.ldm.wan.vae2_2 import comfy.ldm.hunyuan3d.vae import comfy.ldm.ace.vae.music_dcae_pipeline +import comfy.ldm.hunyuan_video.vae import yaml import math import os @@ -48,6 +49,7 @@ import comfy.text_encoders.hidream import comfy.text_encoders.ace import comfy.text_encoders.omnigen2 import comfy.text_encoders.qwen_image +import comfy.text_encoders.hunyuan_image import comfy.model_patcher import comfy.lora @@ -328,6 +330,19 @@ class VAE: self.first_stage_model = StageC_coder() self.downscale_ratio = 32 self.latent_channels = 16 + elif "decoder.conv_in.weight" in sd and sd['decoder.conv_in.weight'].shape[1] == 64: + ddconfig = {"block_out_channels": [128, 256, 512, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 32, "downsample_match_channel": True, "upsample_match_channel": True} + self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] + self.downscale_ratio = 32 + self.upscale_ratio = 32 + self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] + self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"}, + encoder_config={'target': "comfy.ldm.hunyuan_video.vae.Encoder", 'params': ddconfig}, + decoder_config={'target': "comfy.ldm.hunyuan_video.vae.Decoder", 'params': ddconfig}) + + self.memory_used_encode = lambda shape, dtype: (700 * shape[2] * shape[3]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (700 * shape[2] * shape[3] * 32 * 32) * model_management.dtype_size(dtype) + elif "decoder.conv_in.weight" in sd: #default SD1.x/SD2.x VAE parameters ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0} @@ -785,6 +800,7 @@ class CLIPType(Enum): ACE = 16 OMNIGEN2 = 17 QWEN_IMAGE = 18 + HUNYUAN_IMAGE = 19 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}): @@ -806,6 +822,7 @@ class TEModel(Enum): GEMMA_2_2B = 9 QWEN25_3B = 10 QWEN25_7B = 11 + BYT5_SMALL_GLYPH = 12 def detect_te_model(sd): if "text_model.encoder.layers.30.mlp.fc1.weight" in sd: @@ -823,6 +840,9 @@ def detect_te_model(sd): if 'encoder.block.23.layer.1.DenseReluDense.wi.weight' in sd: return TEModel.T5_XXL_OLD if "encoder.block.0.layer.0.SelfAttention.k.weight" in sd: + weight = sd['encoder.block.0.layer.0.SelfAttention.k.weight'] + if weight.shape[0] == 384: + return TEModel.BYT5_SMALL_GLYPH return TEModel.T5_BASE if 'model.layers.0.post_feedforward_layernorm.weight' in sd: return TEModel.GEMMA_2_2B @@ -937,8 +957,12 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.clip = comfy.text_encoders.omnigen2.te(**llama_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.omnigen2.Omnigen2Tokenizer elif te_model == TEModel.QWEN25_7B: - clip_target.clip = comfy.text_encoders.qwen_image.te(**llama_detect(clip_data)) - clip_target.tokenizer = comfy.text_encoders.qwen_image.QwenImageTokenizer + if clip_type == CLIPType.HUNYUAN_IMAGE: + clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer + else: + clip_target.clip = comfy.text_encoders.qwen_image.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.qwen_image.QwenImageTokenizer else: # clip_l if clip_type == CLIPType.SD3: @@ -982,6 +1006,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=clip_l, clip_g=clip_g, t5=t5, llama=llama, **t5_kwargs, **llama_kwargs) clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer + elif clip_type == CLIPType.HUNYUAN_IMAGE: + clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer else: clip_target.clip = sdxl_clip.SDXLClipModel clip_target.tokenizer = sdxl_clip.SDXLTokenizer diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 75dad277d..aa953b462 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -20,6 +20,7 @@ import comfy.text_encoders.wan import comfy.text_encoders.ace import comfy.text_encoders.omnigen2 import comfy.text_encoders.qwen_image +import comfy.text_encoders.hunyuan_image from . import supported_models_base from . import latent_formats @@ -1295,7 +1296,31 @@ class QwenImage(supported_models_base.BASE): hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect)) +class HunyuanImage21(HunyuanVideo): + unet_config = { + "image_model": "hunyuan_video", + "vec_in_dim": None, + } -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] + sampling_settings = { + "shift": 5.0, + } + + latent_format = latent_formats.HunyuanImage21 + + memory_usage_factor = 7.7 + + supported_inference_dtypes = [torch.bfloat16, torch.float32] + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.HunyuanImage21(self, device=device) + return out + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect)) + +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy/text_encoders/byt5_config_small_glyph.json b/comfy/text_encoders/byt5_config_small_glyph.json new file mode 100644 index 000000000..0239c7164 --- /dev/null +++ b/comfy/text_encoders/byt5_config_small_glyph.json @@ -0,0 +1,22 @@ +{ + "d_ff": 3584, + "d_kv": 64, + "d_model": 1472, + "decoder_start_token_id": 0, + "dropout_rate": 0.1, + "eos_token_id": 1, + "dense_act_fn": "gelu_pytorch_tanh", + "initializer_factor": 1.0, + "is_encoder_decoder": true, + "is_gated_act": true, + "layer_norm_epsilon": 1e-06, + "model_type": "t5", + "num_decoder_layers": 4, + "num_heads": 6, + "num_layers": 12, + "output_past": true, + "pad_token_id": 0, + "relative_attention_num_buckets": 32, + "tie_word_embeddings": false, + "vocab_size": 1510 +} diff --git a/comfy/text_encoders/byt5_tokenizer/added_tokens.json b/comfy/text_encoders/byt5_tokenizer/added_tokens.json new file mode 100644 index 000000000..93c190b56 --- /dev/null +++ b/comfy/text_encoders/byt5_tokenizer/added_tokens.json @@ -0,0 +1,127 @@ +{ + "": 259, + "": 359, + "": 360, + "": 361, + "": 362, + "": 363, + "": 364, + "": 365, + "": 366, + "": 367, + "": 368, + "": 269, + "": 369, + "": 370, + "": 371, + "": 372, + "": 373, + "": 374, + "": 375, + "": 376, + "": 377, + "": 378, + "": 270, + "": 379, + "": 380, + "": 381, + "": 382, + "": 383, + "": 271, + "": 272, + "": 273, + "": 274, + "": 275, + "": 276, + "": 277, + "": 278, + "": 260, + "": 279, + "": 280, + "": 281, + "": 282, + "": 283, + "": 284, + "": 285, + "": 286, + "": 287, + "": 288, + "": 261, + "": 289, + "": 290, + "": 291, + "": 292, + "": 293, + "": 294, + "": 295, + "": 296, + "": 297, + "": 298, + "": 262, + "": 299, + "": 300, + "": 301, + "": 302, + "": 303, + "": 304, + "": 305, + "": 306, + "": 307, + "": 308, + "": 263, + "": 309, + "": 310, + "": 311, + "": 312, + "": 313, + "": 314, + "": 315, + "": 316, + "": 317, + "": 318, + "": 264, + "": 319, + "": 320, + "": 321, + "": 322, + "": 323, + "": 324, + "": 325, + "": 326, + "": 327, + "": 328, + "": 265, + "": 329, + "": 330, + "": 331, + "": 332, + "": 333, + "": 334, + "": 335, + "": 336, + "": 337, + "": 338, + "": 266, + "": 339, + "": 340, + "": 341, + "": 342, + "": 343, + "": 344, + "": 345, + "": 346, + "": 347, + "": 348, + "": 267, + "": 349, + "": 350, + "": 351, + "": 352, + "": 353, + "": 354, + "": 355, + "": 356, + "": 357, + "": 358, + "": 268 +} diff --git a/comfy/text_encoders/byt5_tokenizer/special_tokens_map.json b/comfy/text_encoders/byt5_tokenizer/special_tokens_map.json new file mode 100644 index 000000000..04fd58b5f --- /dev/null +++ b/comfy/text_encoders/byt5_tokenizer/special_tokens_map.json @@ -0,0 +1,150 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/comfy/text_encoders/byt5_tokenizer/tokenizer_config.json b/comfy/text_encoders/byt5_tokenizer/tokenizer_config.json new file mode 100644 index 000000000..5b1fe24c1 --- /dev/null +++ b/comfy/text_encoders/byt5_tokenizer/tokenizer_config.json @@ -0,0 +1,1163 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "259": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "260": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "261": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "262": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "263": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "264": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "265": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "266": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "267": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "268": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "269": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "270": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "271": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "272": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "273": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "274": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "275": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "276": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "277": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "278": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "279": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "280": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "281": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "282": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "283": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "284": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "285": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "286": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "287": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "288": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "289": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "290": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "291": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "292": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "293": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "294": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "295": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "296": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "297": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "298": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "299": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "300": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "301": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "302": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "303": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "304": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "305": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "306": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "307": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "308": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "309": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "310": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "311": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "312": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "313": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "314": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "315": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "316": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "317": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "318": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "319": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "320": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "321": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "322": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "323": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "324": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "325": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "326": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "327": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "328": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "329": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "330": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "331": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "332": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "333": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "334": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "335": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "336": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "337": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "338": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "339": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "340": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "341": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "342": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "343": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "344": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "345": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "346": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "347": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "348": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "349": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "350": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "351": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "352": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "353": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "354": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "355": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "356": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "357": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "358": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "359": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "360": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "361": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "362": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "363": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "364": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "365": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "366": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "367": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "368": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "369": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "370": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "371": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "372": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "373": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "374": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "375": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "376": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "377": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "378": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "379": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "380": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "381": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "382": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "383": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_ids": 0, + "extra_special_tokens": {}, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "tokenizer_class": "ByT5Tokenizer", + "unk_token": "" +} diff --git a/comfy/text_encoders/hunyuan_image.py b/comfy/text_encoders/hunyuan_image.py new file mode 100644 index 000000000..be396cae7 --- /dev/null +++ b/comfy/text_encoders/hunyuan_image.py @@ -0,0 +1,100 @@ +from comfy import sd1_clip +import comfy.text_encoders.llama +from .qwen_image import QwenImageTokenizer, QwenImageTEModel +from transformers import ByT5Tokenizer +import os +import re + +class ByT5SmallTokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "byt5_tokenizer") + super().__init__(tokenizer_path, pad_with_end=False, embedding_size=1472, embedding_key='byt5_small', tokenizer_class=ByT5Tokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data) + +class HunyuanImageTokenizer(QwenImageTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) + self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>" + # self.llama_template_images = "{}" + self.byt5 = ByT5SmallTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) + + def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs): + out = super().tokenize_with_weights(text, return_word_ids, **kwargs) + + # ByT5 processing for HunyuanImage + text_prompt_texts = [] + pattern_quote_single = r'\'(.*?)\'' + pattern_quote_double = r'\"(.*?)\"' + pattern_quote_chinese_single = r'‘(.*?)’' + pattern_quote_chinese_double = r'“(.*?)”' + + matches_quote_single = re.findall(pattern_quote_single, text) + matches_quote_double = re.findall(pattern_quote_double, text) + matches_quote_chinese_single = re.findall(pattern_quote_chinese_single, text) + matches_quote_chinese_double = re.findall(pattern_quote_chinese_double, text) + + text_prompt_texts.extend(matches_quote_single) + text_prompt_texts.extend(matches_quote_double) + text_prompt_texts.extend(matches_quote_chinese_single) + text_prompt_texts.extend(matches_quote_chinese_double) + + if len(text_prompt_texts) > 0: + out['byt5'] = self.byt5.tokenize_with_weights(''.join(map(lambda a: 'Text "{}". '.format(a), text_prompt_texts)), return_word_ids, **kwargs) + return out + +class Qwen25_7BVLIModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options={}): + llama_scaled_fp8 = model_options.get("qwen_scaled_fp8", None) + if llama_scaled_fp8 is not None: + model_options = model_options.copy() + model_options["scaled_fp8"] = llama_scaled_fp8 + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) + + +class ByT5SmallModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}): + textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "byt5_config_small_glyph.json") + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, model_options=model_options, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=True, zero_out_masked=True) + + +class HunyuanImageTEModel(QwenImageTEModel): + def __init__(self, byt5=True, device="cpu", dtype=None, model_options={}): + super(QwenImageTEModel, self).__init__(device=device, dtype=dtype, name="qwen25_7b", clip_model=Qwen25_7BVLIModel, model_options=model_options) + + if byt5: + self.byt5_small = ByT5SmallModel(device=device, dtype=dtype, model_options=model_options) + else: + self.byt5_small = None + + def encode_token_weights(self, token_weight_pairs): + cond, p, extra = super().encode_token_weights(token_weight_pairs) + if self.byt5_small is not None and "byt5" in token_weight_pairs: + out = self.byt5_small.encode_token_weights(token_weight_pairs["byt5"]) + extra["conditioning_byt5small"] = out[0] + return cond, p, extra + + def set_clip_options(self, options): + super().set_clip_options(options) + if self.byt5_small is not None: + self.byt5_small.set_clip_options(options) + + def reset_clip_options(self): + super().reset_clip_options() + if self.byt5_small is not None: + self.byt5_small.reset_clip_options() + + def load_sd(self, sd): + if "encoder.block.0.layer.0.SelfAttention.o.weight" in sd: + return self.byt5_small.load_sd(sd) + else: + return super().load_sd(sd) + +def te(byt5=True, dtype_llama=None, llama_scaled_fp8=None): + class QwenImageTEModel_(HunyuanImageTEModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: + model_options = model_options.copy() + model_options["qwen_scaled_fp8"] = llama_scaled_fp8 + if dtype_llama is not None: + dtype = dtype_llama + super().__init__(byt5=byt5, device=device, dtype=dtype, model_options=model_options) + return QwenImageTEModel_ diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py index d7278e7a7..ce031ceb2 100644 --- a/comfy_extras/nodes_hunyuan.py +++ b/comfy_extras/nodes_hunyuan.py @@ -113,6 +113,20 @@ class HunyuanImageToVideo: out_latent["samples"] = latent return (positive, out_latent) +class EmptyHunyuanImageLatent: + @classmethod + def INPUT_TYPES(s): + return {"required": { "width": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), + "height": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), + "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} + RETURN_TYPES = ("LATENT",) + FUNCTION = "generate" + + CATEGORY = "latent" + + def generate(self, width, height, batch_size=1): + latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device()) + return ({"samples":latent}, ) NODE_CLASS_MAPPINGS = { @@ -120,4 +134,5 @@ NODE_CLASS_MAPPINGS = { "TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo, "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo, "HunyuanImageToVideo": HunyuanImageToVideo, + "EmptyHunyuanImageLatent": EmptyHunyuanImageLatent, } diff --git a/nodes.py b/nodes.py index 6c2f9dd14..2befb4b75 100644 --- a/nodes.py +++ b/nodes.py @@ -925,7 +925,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), @@ -953,7 +953,7 @@ class DualCLIPLoader: def INPUT_TYPES(s): return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ), "clip_name2": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream"], ), + "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), @@ -963,7 +963,7 @@ class DualCLIPLoader: CATEGORY = "advanced/loaders" - DESCRIPTION = "[Recipes]\n\nsdxl: clip-l, clip-g\nsd3: clip-l, clip-g / clip-l, t5 / clip-g, t5\nflux: clip-l, t5\nhidream: at least one of t5 or llama, recommended t5 and llama" + DESCRIPTION = "[Recipes]\n\nsdxl: clip-l, clip-g\nsd3: clip-l, clip-g / clip-l, t5 / clip-g, t5\nflux: clip-l, t5\nhidream: at least one of t5 or llama, recommended t5 and llama\nhunyuan_image: qwen2.5vl 7b and byt5 small" def load_clip(self, clip_name1, clip_name2, type, device="default"): clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION) From 70fc0425b36515926c6414aee9f2269b27880cc2 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 10 Sep 2025 14:09:16 +0800 Subject: [PATCH 256/759] Update template to 0.1.76 (#9793) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3008a5dc3..ea1931d78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.11 -comfyui-workflow-templates==0.1.75 +comfyui-workflow-templates==0.1.76 comfyui-embedded-docs==0.2.6 torch torchsde From 543888d3d84a6ec4c4273838d5179845840e3226 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 9 Sep 2025 23:15:34 -0700 Subject: [PATCH 257/759] Fix lowvram issue with hunyuan image vae. (#9794) --- comfy/ldm/hunyuan_video/vae.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/hunyuan_video/vae.py b/comfy/ldm/hunyuan_video/vae.py index 8d406089b..40c12b183 100644 --- a/comfy/ldm/hunyuan_video/vae.py +++ b/comfy/ldm/hunyuan_video/vae.py @@ -65,7 +65,7 @@ class Encoder(nn.Module): self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv2d) self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=ops.Conv2d) - self.norm_out = nn.GroupNorm(32, ch, 1e-6, True) + self.norm_out = ops.GroupNorm(32, ch, 1e-6, True) self.conv_out = ops.Conv2d(ch, z_channels << 1, 3, 1, 1) def forward(self, x): @@ -120,7 +120,7 @@ class Decoder(nn.Module): ch = nxt self.up.append(stage) - self.norm_out = nn.GroupNorm(32, ch, 1e-6, True) + self.norm_out = ops.GroupNorm(32, ch, 1e-6, True) self.conv_out = ops.Conv2d(ch, out_channels, 3, 1, 1) def forward(self, z): From de44b95db6c7ef107f26e7edf30748b608afaa48 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 10 Sep 2025 12:06:47 +0300 Subject: [PATCH 258/759] add StabilityAudio API nodes (#9749) --- comfy_api_nodes/apinode_utils.py | 65 +++++ comfy_api_nodes/apis/stability_api.py | 22 ++ comfy_api_nodes/nodes_stability.py | 312 ++++++++++++++++++++++- comfy_api_nodes/util/validation_utils.py | 20 +- 4 files changed, 415 insertions(+), 4 deletions(-) diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py index f953f86df..37438f835 100644 --- a/comfy_api_nodes/apinode_utils.py +++ b/comfy_api_nodes/apinode_utils.py @@ -518,6 +518,71 @@ async def upload_audio_to_comfyapi( return await upload_file_to_comfyapi(audio_bytes_io, filename, mime_type, auth_kwargs) +def f32_pcm(wav: torch.Tensor) -> torch.Tensor: + """Convert audio to float 32 bits PCM format. Copy-paste from nodes_audio.py file.""" + if wav.dtype.is_floating_point: + return wav + elif wav.dtype == torch.int16: + return wav.float() / (2 ** 15) + elif wav.dtype == torch.int32: + return wav.float() / (2 ** 31) + raise ValueError(f"Unsupported wav dtype: {wav.dtype}") + + +def audio_bytes_to_audio_input(audio_bytes: bytes,) -> dict: + """ + Decode any common audio container from bytes using PyAV and return + a Comfy AUDIO dict: {"waveform": [1, C, T] float32, "sample_rate": int}. + """ + with av.open(io.BytesIO(audio_bytes)) as af: + if not af.streams.audio: + raise ValueError("No audio stream found in response.") + stream = af.streams.audio[0] + + in_sr = int(stream.codec_context.sample_rate) + out_sr = in_sr + + frames: list[torch.Tensor] = [] + n_channels = stream.channels or 1 + + for frame in af.decode(streams=stream.index): + arr = frame.to_ndarray() # shape can be [C, T] or [T, C] or [T] + buf = torch.from_numpy(arr) + if buf.ndim == 1: + buf = buf.unsqueeze(0) # [T] -> [1, T] + elif buf.shape[0] != n_channels and buf.shape[-1] == n_channels: + buf = buf.transpose(0, 1).contiguous() # [T, C] -> [C, T] + elif buf.shape[0] != n_channels: + buf = buf.reshape(-1, n_channels).t().contiguous() # fallback to [C, T] + frames.append(buf) + + if not frames: + raise ValueError("Decoded zero audio frames.") + + wav = torch.cat(frames, dim=1) # [C, T] + wav = f32_pcm(wav) + return {"waveform": wav.unsqueeze(0).contiguous(), "sample_rate": out_sr} + + +def audio_input_to_mp3(audio: AudioInput) -> io.BytesIO: + waveform = audio["waveform"].cpu() + + output_buffer = io.BytesIO() + output_container = av.open(output_buffer, mode='w', format="mp3") + + out_stream = output_container.add_stream("libmp3lame", rate=audio["sample_rate"]) + out_stream.bit_rate = 320000 + + frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[0] == 1 else 'stereo') + frame.sample_rate = audio["sample_rate"] + frame.pts = 0 + output_container.mux(out_stream.encode(frame)) + output_container.mux(out_stream.encode(None)) + output_container.close() + output_buffer.seek(0) + return output_buffer + + def audio_to_base64_string( audio: AudioInput, container_format: str = "mp4", codec_name: str = "aac" ) -> str: diff --git a/comfy_api_nodes/apis/stability_api.py b/comfy_api_nodes/apis/stability_api.py index 47c87daec..718360187 100644 --- a/comfy_api_nodes/apis/stability_api.py +++ b/comfy_api_nodes/apis/stability_api.py @@ -125,3 +125,25 @@ class StabilityResultsGetResponse(BaseModel): class StabilityAsyncResponse(BaseModel): id: Optional[str] = Field(None) + + +class StabilityTextToAudioRequest(BaseModel): + model: str = Field(...) + prompt: str = Field(...) + duration: int = Field(190, ge=1, le=190) + seed: int = Field(0, ge=0, le=4294967294) + steps: int = Field(8, ge=4, le=8) + output_format: str = Field("wav") + + +class StabilityAudioToAudioRequest(StabilityTextToAudioRequest): + strength: float = Field(0.01, ge=0.01, le=1.0) + + +class StabilityAudioInpaintRequest(StabilityTextToAudioRequest): + mask_start: int = Field(30, ge=0, le=190) + mask_end: int = Field(190, ge=0, le=190) + + +class StabilityAudioResponse(BaseModel): + audio: Optional[str] = Field(None) diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index e05cb6bb2..5ba5ed986 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -2,7 +2,7 @@ from inspect import cleandoc from typing import Optional from typing_extensions import override -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, Input, io as comfy_io from comfy_api_nodes.apis.stability_api import ( StabilityUpscaleConservativeRequest, StabilityUpscaleCreativeRequest, @@ -15,6 +15,10 @@ from comfy_api_nodes.apis.stability_api import ( Stability_SD3_5_Model, Stability_SD3_5_GenerationMode, get_stability_style_presets, + StabilityTextToAudioRequest, + StabilityAudioToAudioRequest, + StabilityAudioInpaintRequest, + StabilityAudioResponse, ) from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -27,7 +31,10 @@ from comfy_api_nodes.apinode_utils import ( bytesio_to_image_tensor, tensor_to_bytesio, validate_string, + audio_bytes_to_audio_input, + audio_input_to_mp3, ) +from comfy_api_nodes.util.validation_utils import validate_audio_duration import torch import base64 @@ -649,6 +656,306 @@ class StabilityUpscaleFastNode(comfy_io.ComfyNode): return comfy_io.NodeOutput(returned_image) +class StabilityTextToAudio(comfy_io.ComfyNode): + """Generates high-quality music and sound effects from text descriptions.""" + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityTextToAudio", + display_name="Stability AI Text To Audio", + category="api node/audio/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Combo.Input( + "model", + options=["stable-audio-2.5"], + ), + comfy_io.String.Input("prompt", multiline=True, default=""), + comfy_io.Int.Input( + "duration", + default=190, + min=1, + max=190, + step=1, + tooltip="Controls the duration in seconds of the generated audio.", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967294, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="The random seed used for generation.", + optional=True, + ), + comfy_io.Int.Input( + "steps", + default=8, + min=4, + max=8, + step=1, + tooltip="Controls the number of sampling steps.", + optional=True, + ), + ], + outputs=[ + comfy_io.Audio.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute(cls, model: str, prompt: str, duration: int, seed: int, steps: int) -> comfy_io.NodeOutput: + validate_string(prompt, max_length=10000) + payload = StabilityTextToAudioRequest(prompt=prompt, model=model, duration=duration, seed=seed, steps=steps) + operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/stability/v2beta/audio/stable-audio-2/text-to-audio", + method=HttpMethod.POST, + request_model=StabilityTextToAudioRequest, + response_model=StabilityAudioResponse, + ), + request=payload, + content_type="multipart/form-data", + auth_kwargs= { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + ) + response_api = await operation.execute() + if not response_api.audio: + raise ValueError("No audio file was received in response.") + return comfy_io.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) + + +class StabilityAudioToAudio(comfy_io.ComfyNode): + """Transforms existing audio samples into new high-quality compositions using text instructions.""" + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityAudioToAudio", + display_name="Stability AI Audio To Audio", + category="api node/audio/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Combo.Input( + "model", + options=["stable-audio-2.5"], + ), + comfy_io.String.Input("prompt", multiline=True, default=""), + comfy_io.Audio.Input("audio", tooltip="Audio must be between 6 and 190 seconds long."), + comfy_io.Int.Input( + "duration", + default=190, + min=1, + max=190, + step=1, + tooltip="Controls the duration in seconds of the generated audio.", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967294, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="The random seed used for generation.", + optional=True, + ), + comfy_io.Int.Input( + "steps", + default=8, + min=4, + max=8, + step=1, + tooltip="Controls the number of sampling steps.", + optional=True, + ), + comfy_io.Float.Input( + "strength", + default=1, + min=0.01, + max=1.0, + step=0.01, + display_mode=comfy_io.NumberDisplay.slider, + tooltip="Parameter controls how much influence the audio parameter has on the generated audio.", + optional=True, + ), + ], + outputs=[ + comfy_io.Audio.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, model: str, prompt: str, audio: Input.Audio, duration: int, seed: int, steps: int, strength: float + ) -> comfy_io.NodeOutput: + validate_string(prompt, max_length=10000) + validate_audio_duration(audio, 6, 190) + payload = StabilityAudioToAudioRequest( + prompt=prompt, model=model, duration=duration, seed=seed, steps=steps, strength=strength + ) + operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/stability/v2beta/audio/stable-audio-2/audio-to-audio", + method=HttpMethod.POST, + request_model=StabilityAudioToAudioRequest, + response_model=StabilityAudioResponse, + ), + request=payload, + content_type="multipart/form-data", + files={"audio": audio_input_to_mp3(audio)}, + auth_kwargs= { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + ) + response_api = await operation.execute() + if not response_api.audio: + raise ValueError("No audio file was received in response.") + return comfy_io.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) + + +class StabilityAudioInpaint(comfy_io.ComfyNode): + """Transforms part of existing audio sample using text instructions.""" + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="StabilityAudioInpaint", + display_name="Stability AI Audio Inpaint", + category="api node/audio/Stability AI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Combo.Input( + "model", + options=["stable-audio-2.5"], + ), + comfy_io.String.Input("prompt", multiline=True, default=""), + comfy_io.Audio.Input("audio", tooltip="Audio must be between 6 and 190 seconds long."), + comfy_io.Int.Input( + "duration", + default=190, + min=1, + max=190, + step=1, + tooltip="Controls the duration in seconds of the generated audio.", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=4294967294, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="The random seed used for generation.", + optional=True, + ), + comfy_io.Int.Input( + "steps", + default=8, + min=4, + max=8, + step=1, + tooltip="Controls the number of sampling steps.", + optional=True, + ), + comfy_io.Int.Input( + "mask_start", + default=30, + min=0, + max=190, + step=1, + optional=True, + ), + comfy_io.Int.Input( + "mask_end", + default=190, + min=0, + max=190, + step=1, + optional=True, + ), + ], + outputs=[ + comfy_io.Audio.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + audio: Input.Audio, + duration: int, + seed: int, + steps: int, + mask_start: int, + mask_end: int, + ) -> comfy_io.NodeOutput: + validate_string(prompt, max_length=10000) + if mask_end <= mask_start: + raise ValueError(f"Value of mask_end({mask_end}) should be greater then mask_start({mask_start})") + validate_audio_duration(audio, 6, 190) + + payload = StabilityAudioInpaintRequest( + prompt=prompt, + model=model, + duration=duration, + seed=seed, + steps=steps, + mask_start=mask_start, + mask_end=mask_end, + ) + operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/stability/v2beta/audio/stable-audio-2/inpaint", + method=HttpMethod.POST, + request_model=StabilityAudioInpaintRequest, + response_model=StabilityAudioResponse, + ), + request=payload, + content_type="multipart/form-data", + files={"audio": audio_input_to_mp3(audio)}, + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + ) + response_api = await operation.execute() + if not response_api.audio: + raise ValueError("No audio file was received in response.") + return comfy_io.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) + + class StabilityExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: @@ -658,6 +965,9 @@ class StabilityExtension(ComfyExtension): StabilityUpscaleConservativeNode, StabilityUpscaleCreativeNode, StabilityUpscaleFastNode, + StabilityTextToAudio, + StabilityAudioToAudio, + StabilityAudioInpaint, ] diff --git a/comfy_api_nodes/util/validation_utils.py b/comfy_api_nodes/util/validation_utils.py index 606b794bf..ca913e9b3 100644 --- a/comfy_api_nodes/util/validation_utils.py +++ b/comfy_api_nodes/util/validation_utils.py @@ -2,7 +2,7 @@ import logging from typing import Optional import torch -from comfy_api.input.video_types import VideoInput +from comfy_api.latest import Input def get_image_dimensions(image: torch.Tensor) -> tuple[int, int]: @@ -101,7 +101,7 @@ def validate_aspect_ratio_closeness( def validate_video_dimensions( - video: VideoInput, + video: Input.Video, min_width: Optional[int] = None, max_width: Optional[int] = None, min_height: Optional[int] = None, @@ -126,7 +126,7 @@ def validate_video_dimensions( def validate_video_duration( - video: VideoInput, + video: Input.Video, min_duration: Optional[float] = None, max_duration: Optional[float] = None, ): @@ -151,3 +151,17 @@ def get_number_of_images(images): if isinstance(images, torch.Tensor): return images.shape[0] if images.ndim >= 4 else 1 return len(images) + + +def validate_audio_duration( + audio: Input.Audio, + min_duration: Optional[float] = None, + max_duration: Optional[float] = None, +) -> None: + sr = int(audio["sample_rate"]) + dur = int(audio["waveform"].shape[-1]) / sr + eps = 1.0 / sr + if min_duration is not None and dur + eps < min_duration: + raise ValueError(f"Audio duration must be at least {min_duration}s, got {dur + eps:.2f}s") + if max_duration is not None and dur - eps > max_duration: + raise ValueError(f"Audio duration must be at most {max_duration}s, got {dur - eps:.2f}s") From 8d7c930246bd33c32eb957b01ab0d364af6e81c0 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 10 Sep 2025 10:51:02 -0400 Subject: [PATCH 259/759] ComfyUI version v0.3.58 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 4cc3c8647..37361bd75 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.57" +__version__ = "0.3.58" diff --git a/pyproject.toml b/pyproject.toml index d75cd04a2..f02ab9126 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.57" +version = "0.3.58" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 9b0553809cbac084aac0576892aca3e448eb07c7 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 11 Sep 2025 00:13:18 +0300 Subject: [PATCH 260/759] add new ByteDanceSeedream (4.0) node (#9802) --- comfy_api_nodes/nodes_bytedance.py | 208 ++++++++++++++++++++++++++++- 1 file changed, 207 insertions(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index 064df2d10..369a3a4fe 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -77,6 +77,22 @@ class Image2ImageTaskCreationRequest(BaseModel): watermark: Optional[bool] = Field(True) +class Seedream4Options(BaseModel): + max_images: int = Field(15) + + +class Seedream4TaskCreationRequest(BaseModel): + model: str = Field("seedream-4-0-250828") + prompt: str = Field(...) + response_format: str = Field("url") + image: Optional[list[str]] = Field(None, description="Image URLs") + size: str = Field(...) + seed: int = Field(..., ge=0, le=2147483647) + sequential_image_generation: str = Field("disabled") + sequential_image_generation_options: Seedream4Options = Field(Seedream4Options(max_images=15)) + watermark: bool = Field(True) + + class ImageTaskCreationResponse(BaseModel): model: str = Field(...) created: int = Field(..., description="Unix timestamp (in seconds) indicating time when the request was created.") @@ -143,6 +159,19 @@ RECOMMENDED_PRESETS = [ ("Custom", None, None), ] +RECOMMENDED_PRESETS_SEEDREAM_4 = [ + ("2048x2048 (1:1)", 2048, 2048), + ("2304x1728 (4:3)", 2304, 1728), + ("1728x2304 (3:4)", 1728, 2304), + ("2560x1440 (16:9)", 2560, 1440), + ("1440x2560 (9:16)", 1440, 2560), + ("2496x1664 (3:2)", 2496, 1664), + ("1664x2496 (2:3)", 1664, 2496), + ("3024x1296 (21:9)", 3024, 1296), + ("4096x4096 (1:1)", 4096, 4096), + ("Custom", None, None), +] + # The time in this dictionary are given for 10 seconds duration. VIDEO_TASKS_EXECUTION_TIME = { "seedance-1-0-lite-t2v-250428": { @@ -348,7 +377,7 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode): return comfy_io.Schema( node_id="ByteDanceImageEditNode", display_name="ByteDance Image Edit", - category="api node/video/ByteDance", + category="api node/image/ByteDance", description="Edit images using ByteDance models via api based on prompt", inputs=[ comfy_io.Combo.Input( @@ -451,6 +480,182 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode): return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) +class ByteDanceSeedreamNode(comfy_io.ComfyNode): + + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="ByteDanceSeedreamNode", + display_name="ByteDance Seedream 4", + category="api node/image/ByteDance", + description="Unified text-to-image generation and precise single-sentence editing at up to 4K resolution.", + inputs=[ + comfy_io.Combo.Input( + "model", + options=["seedream-4-0-250828"], + tooltip="Model name", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for creating or editing an image.", + ), + comfy_io.Image.Input( + "image", + tooltip="Input image(s) for image-to-image generation. " + "List of 1-10 images for single or multi-reference generation.", + optional=True, + ), + comfy_io.Combo.Input( + "size_preset", + options=[label for label, _, _ in RECOMMENDED_PRESETS_SEEDREAM_4], + tooltip="Pick a recommended size. Select Custom to use the width and height below.", + ), + comfy_io.Int.Input( + "width", + default=2048, + min=1024, + max=4096, + step=64, + tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`", + optional=True, + ), + comfy_io.Int.Input( + "height", + default=2048, + min=1024, + max=4096, + step=64, + tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`", + optional=True, + ), + comfy_io.Combo.Input( + "sequential_image_generation", + options=["disabled", "auto"], + tooltip="Group image generation mode. " + "'disabled' generates a single image. " + "'auto' lets the model decide whether to generate multiple related images " + "(e.g., story scenes, character variations).", + optional=True, + ), + comfy_io.Int.Input( + "max_images", + default=1, + min=1, + max=15, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Maximum number of images to generate when sequential_image_generation='auto'. " + "Total images (input + generated) cannot exceed 15.", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the image.", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + image: torch.Tensor = None, + size_preset: str = RECOMMENDED_PRESETS_SEEDREAM_4[0][0], + width: int = 2048, + height: int = 2048, + sequential_image_generation: str = "disabled", + max_images: int = 1, + seed: int = 0, + watermark: bool = True, + ) -> comfy_io.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + w = h = None + for label, tw, th in RECOMMENDED_PRESETS_SEEDREAM_4: + if label == size_preset: + w, h = tw, th + break + + if w is None or h is None: + w, h = width, height + if not (1024 <= w <= 4096) or not (1024 <= h <= 4096): + raise ValueError( + f"Custom size out of range: {w}x{h}. " + "Both width and height must be between 1024 and 4096 pixels." + ) + n_input_images = get_number_of_images(image) if image is not None else 0 + if n_input_images > 10: + raise ValueError(f"Maximum of 10 reference images are supported, but {n_input_images} received.") + if sequential_image_generation == "auto" and n_input_images + max_images > 15: + raise ValueError( + "The maximum number of generated images plus the number of reference images cannot exceed 15." + ) + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + reference_images_urls = [] + if n_input_images: + for i in image: + validate_image_aspect_ratio_range(i, (1, 3), (3, 1)) + reference_images_urls = (await upload_images_to_comfyapi( + image, + max_images=n_input_images, + mime_type="image/png", + auth_kwargs=auth_kwargs, + )) + payload = Seedream4TaskCreationRequest( + model=model, + prompt=prompt, + image=reference_images_urls, + size=f"{w}x{h}", + seed=seed, + sequential_image_generation=sequential_image_generation, + sequential_image_generation_options=Seedream4Options(max_images=max_images), + watermark=watermark, + ) + response = await SynchronousOperation( + endpoint=ApiEndpoint( + path=BYTEPLUS_IMAGE_ENDPOINT, + method=HttpMethod.POST, + request_model=Seedream4TaskCreationRequest, + response_model=ImageTaskCreationResponse, + ), + request=payload, + auth_kwargs=auth_kwargs, + ).execute() + + if len(response.data) == 1: + return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) + return comfy_io.NodeOutput( + torch.cat([await download_url_to_image_tensor(str(i["url"])) for i in response.data]) + ) + + class ByteDanceTextToVideoNode(comfy_io.ComfyNode): @classmethod @@ -1001,6 +1206,7 @@ class ByteDanceExtension(ComfyExtension): return [ ByteDanceImageNode, ByteDanceImageEditNode, + ByteDanceSeedreamNode, ByteDanceTextToVideoNode, ByteDanceImageToVideoNode, ByteDanceFirstLastFrameNode, From df34f1549a431c85a6326e87075a206228697cde Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Thu, 11 Sep 2025 05:16:41 +0800 Subject: [PATCH 261/759] Update template to 0.1.78 (#9806) * Update template to 0.1.77 * Update template to 0.1.78 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ea1931d78..d31df0fec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.11 -comfyui-workflow-templates==0.1.76 +comfyui-workflow-templates==0.1.78 comfyui-embedded-docs==0.2.6 torch torchsde From 72212fef660bcd7d9702fa52011d089c027a64d8 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 10 Sep 2025 17:25:41 -0400 Subject: [PATCH 262/759] ComfyUI version 0.3.59 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 37361bd75..ee58205f5 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.58" +__version__ = "0.3.59" diff --git a/pyproject.toml b/pyproject.toml index f02ab9126..a7fc1a5a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.58" +version = "0.3.59" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From e01e99d075852b94e93f27ea64ab862a49a7d2cc Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 10 Sep 2025 20:17:34 -0700 Subject: [PATCH 263/759] Support hunyuan image distilled model. (#9807) --- comfy/ldm/hunyuan_video/model.py | 14 ++++++++++++++ comfy/model_detection.py | 12 ++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py index ca289c5bd..7732182a4 100644 --- a/comfy/ldm/hunyuan_video/model.py +++ b/comfy/ldm/hunyuan_video/model.py @@ -41,6 +41,7 @@ class HunyuanVideoParams: qkv_bias: bool guidance_embed: bool byt5: bool + meanflow: bool class SelfAttentionRef(nn.Module): @@ -256,6 +257,11 @@ class HunyuanVideo(nn.Module): else: self.byt5_in = None + if params.meanflow: + self.time_r_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) + else: + self.time_r_in = None + if final_layer: self.final_layer = LastLayer(self.hidden_size, self.patch_size[-1], self.out_channels, dtype=dtype, device=device, operations=operations) @@ -282,6 +288,14 @@ class HunyuanVideo(nn.Module): img = self.img_in(img) vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype)) + if self.time_r_in is not None: + w = torch.where(transformer_options['sigmas'][0] == transformer_options['sample_sigmas'])[0] # This most likely could be improved + if len(w) > 0: + timesteps_r = transformer_options['sample_sigmas'][w[0] + 1] + timesteps_r = timesteps_r.unsqueeze(0).to(device=timesteps.device, dtype=timesteps.dtype) + vec_r = self.time_r_in(timestep_embedding(timesteps_r, 256, time_factor=1000.0).to(img.dtype)) + vec = (vec + vec_r) / 2 + if ref_latent is not None: ref_latent_ids = self.img_ids(ref_latent) ref_latent = self.img_in(ref_latent) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index dbcbe5f5a..fe983cede 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -142,12 +142,20 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["in_channels"] = in_w.shape[1] #SkyReels img2video has 32 input channels dit_config["patch_size"] = list(in_w.shape[2:]) dit_config["out_channels"] = out_w.shape[0] // math.prod(dit_config["patch_size"]) - if '{}vector_in.in_layer.weight'.format(key_prefix) in state_dict: + if any(s.startswith('{}vector_in.'.format(key_prefix)) for s in state_dict_keys): dit_config["vec_in_dim"] = 768 - dit_config["axes_dim"] = [16, 56, 56] else: dit_config["vec_in_dim"] = None + + if len(dit_config["patch_size"]) == 2: dit_config["axes_dim"] = [64, 64] + else: + dit_config["axes_dim"] = [16, 56, 56] + + if any(s.startswith('{}time_r_in.'.format(key_prefix)) for s in state_dict_keys): + dit_config["meanflow"] = True + else: + dit_config["meanflow"] = False dit_config["context_in_dim"] = state_dict['{}txt_in.input_embedder.weight'.format(key_prefix)].shape[1] dit_config["hidden_size"] = in_w.shape[0] From df6850fae8a75126cb7a645e38d58cebcfd51096 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Fri, 12 Sep 2025 02:59:26 +0800 Subject: [PATCH 264/759] Update template to 0.1.81 (#9811) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d31df0fec..0e21967ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.25.11 -comfyui-workflow-templates==0.1.78 +comfyui-workflow-templates==0.1.81 comfyui-embedded-docs==0.2.6 torch torchsde From 18de0b28305fd8bf002d74e91c0630bd76b01d6b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 11 Sep 2025 16:33:02 -0700 Subject: [PATCH 265/759] Fast preview for hunyuan image. (#9814) --- comfy/latent_formats.py | 68 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 859ae8421..f975b5e11 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -538,6 +538,74 @@ class HunyuanImage21(LatentFormat): latent_dimensions = 2 scale_factor = 0.75289 + latent_rgb_factors = [ + [-0.0154, -0.0397, -0.0521], + [ 0.0005, 0.0093, 0.0006], + [-0.0805, -0.0773, -0.0586], + [-0.0494, -0.0487, -0.0498], + [-0.0212, -0.0076, -0.0261], + [-0.0179, -0.0417, -0.0505], + [ 0.0158, 0.0310, 0.0239], + [ 0.0409, 0.0516, 0.0201], + [ 0.0350, 0.0553, 0.0036], + [-0.0447, -0.0327, -0.0479], + [-0.0038, -0.0221, -0.0365], + [-0.0423, -0.0718, -0.0654], + [ 0.0039, 0.0368, 0.0104], + [ 0.0655, 0.0217, 0.0122], + [ 0.0490, 0.1638, 0.2053], + [ 0.0932, 0.0829, 0.0650], + [-0.0186, -0.0209, -0.0135], + [-0.0080, -0.0076, -0.0148], + [-0.0284, -0.0201, 0.0011], + [-0.0642, -0.0294, -0.0777], + [-0.0035, 0.0076, -0.0140], + [ 0.0519, 0.0731, 0.0887], + [-0.0102, 0.0095, 0.0704], + [ 0.0068, 0.0218, -0.0023], + [-0.0726, -0.0486, -0.0519], + [ 0.0260, 0.0295, 0.0263], + [ 0.0250, 0.0333, 0.0341], + [ 0.0168, -0.0120, -0.0174], + [ 0.0226, 0.1037, 0.0114], + [ 0.2577, 0.1906, 0.1604], + [-0.0646, -0.0137, -0.0018], + [-0.0112, 0.0309, 0.0358], + [-0.0347, 0.0146, -0.0481], + [ 0.0234, 0.0179, 0.0201], + [ 0.0157, 0.0313, 0.0225], + [ 0.0423, 0.0675, 0.0524], + [-0.0031, 0.0027, -0.0255], + [ 0.0447, 0.0555, 0.0330], + [-0.0152, 0.0103, 0.0299], + [-0.0755, -0.0489, -0.0635], + [ 0.0853, 0.0788, 0.1017], + [-0.0272, -0.0294, -0.0471], + [ 0.0440, 0.0400, -0.0137], + [ 0.0335, 0.0317, -0.0036], + [-0.0344, -0.0621, -0.0984], + [-0.0127, -0.0630, -0.0620], + [-0.0648, 0.0360, 0.0924], + [-0.0781, -0.0801, -0.0409], + [ 0.0363, 0.0613, 0.0499], + [ 0.0238, 0.0034, 0.0041], + [-0.0135, 0.0258, 0.0310], + [ 0.0614, 0.1086, 0.0589], + [ 0.0428, 0.0350, 0.0205], + [ 0.0153, 0.0173, -0.0018], + [-0.0288, -0.0455, -0.0091], + [ 0.0344, 0.0109, -0.0157], + [-0.0205, -0.0247, -0.0187], + [ 0.0487, 0.0126, 0.0064], + [-0.0220, -0.0013, 0.0074], + [-0.0203, -0.0094, -0.0048], + [-0.0719, 0.0429, -0.0442], + [ 0.1042, 0.0497, 0.0356], + [-0.0659, -0.0578, -0.0280], + [-0.0060, -0.0322, -0.0234]] + + latent_rgb_factors_bias = [0.0007, -0.0256, -0.0206] + class Hunyuan3Dv2(LatentFormat): latent_channels = 64 latent_dimensions = 1 From 33bd9ed9cb941127b335244c6cc0a8cdc1ac1696 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 11 Sep 2025 21:43:20 -0700 Subject: [PATCH 266/759] Implement hunyuan image refiner model. (#9817) --- comfy/latent_formats.py | 5 + comfy/ldm/hunyuan_video/model.py | 11 +- comfy/ldm/hunyuan_video/vae_refiner.py | 268 ++++++++++++++++++++ comfy/ldm/models/autoencoder.py | 6 + comfy/ldm/modules/diffusionmodules/model.py | 10 +- comfy/model_base.py | 20 ++ comfy/sd.py | 17 +- comfy/supported_models.py | 19 +- comfy_extras/nodes_hunyuan.py | 23 ++ 9 files changed, 367 insertions(+), 12 deletions(-) create mode 100644 comfy/ldm/hunyuan_video/vae_refiner.py diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index f975b5e11..894540879 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -606,6 +606,11 @@ class HunyuanImage21(LatentFormat): latent_rgb_factors_bias = [0.0007, -0.0256, -0.0206] +class HunyuanImage21Refiner(LatentFormat): + latent_channels = 64 + latent_dimensions = 3 + scale_factor = 1.03682 + class Hunyuan3Dv2(LatentFormat): latent_channels = 64 latent_dimensions = 1 diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py index 7732182a4..ca86b8bb1 100644 --- a/comfy/ldm/hunyuan_video/model.py +++ b/comfy/ldm/hunyuan_video/model.py @@ -278,6 +278,7 @@ class HunyuanVideo(nn.Module): guidance: Tensor = None, guiding_frame_index=None, ref_latent=None, + disable_time_r=False, control=None, transformer_options={}, ) -> Tensor: @@ -288,7 +289,7 @@ class HunyuanVideo(nn.Module): img = self.img_in(img) vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype)) - if self.time_r_in is not None: + if (self.time_r_in is not None) and (not disable_time_r): w = torch.where(transformer_options['sigmas'][0] == transformer_options['sample_sigmas'])[0] # This most likely could be improved if len(w) > 0: timesteps_r = transformer_options['sample_sigmas'][w[0] + 1] @@ -428,14 +429,14 @@ class HunyuanVideo(nn.Module): img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) return repeat(img_ids, "h w c -> b (h w) c", b=bs) - def forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs): + def forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs): return comfy.patcher_extension.WrapperExecutor.new_class_executor( self._forward, self, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) - ).execute(x, timestep, context, y, txt_byt5, guidance, attention_mask, guiding_frame_index, ref_latent, control, transformer_options, **kwargs) + ).execute(x, timestep, context, y, txt_byt5, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs) - def _forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs): + def _forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs): bs = x.shape[0] if len(self.patch_size) == 3: img_ids = self.img_ids(x) @@ -443,5 +444,5 @@ class HunyuanVideo(nn.Module): else: img_ids = self.img_ids_2d(x) txt_ids = torch.zeros((bs, context.shape[1], 2), device=x.device, dtype=x.dtype) - out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, guidance, guiding_frame_index, ref_latent, control=control, transformer_options=transformer_options) + out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options) return out diff --git a/comfy/ldm/hunyuan_video/vae_refiner.py b/comfy/ldm/hunyuan_video/vae_refiner.py new file mode 100644 index 000000000..e3fff9bbe --- /dev/null +++ b/comfy/ldm/hunyuan_video/vae_refiner.py @@ -0,0 +1,268 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d +import comfy.ops +import comfy.ldm.models.autoencoder +ops = comfy.ops.disable_weight_init + +class RMS_norm(nn.Module): + def __init__(self, dim): + super().__init__() + shape = (dim, 1, 1, 1) + self.scale = dim**0.5 + self.gamma = nn.Parameter(torch.empty(shape)) + + def forward(self, x): + return F.normalize(x, dim=1) * self.scale * self.gamma + +class DnSmpl(nn.Module): + def __init__(self, ic, oc, tds=True): + super().__init__() + fct = 2 * 2 * 2 if tds else 1 * 2 * 2 + assert oc % fct == 0 + self.conv = VideoConv3d(ic, oc // fct, kernel_size=3) + + self.tds = tds + self.gs = fct * ic // oc + + def forward(self, x): + r1 = 2 if self.tds else 1 + h = self.conv(x) + + if self.tds: + hf = h[:, :, :1, :, :] + b, c, f, ht, wd = hf.shape + hf = hf.reshape(b, c, f, ht // 2, 2, wd // 2, 2) + hf = hf.permute(0, 4, 6, 1, 2, 3, 5) + hf = hf.reshape(b, 2 * 2 * c, f, ht // 2, wd // 2) + hf = torch.cat([hf, hf], dim=1) + + hn = h[:, :, 1:, :, :] + b, c, frms, ht, wd = hn.shape + nf = frms // r1 + hn = hn.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2) + hn = hn.permute(0, 3, 5, 7, 1, 2, 4, 6) + hn = hn.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2) + + h = torch.cat([hf, hn], dim=2) + + xf = x[:, :, :1, :, :] + b, ci, f, ht, wd = xf.shape + xf = xf.reshape(b, ci, f, ht // 2, 2, wd // 2, 2) + xf = xf.permute(0, 4, 6, 1, 2, 3, 5) + xf = xf.reshape(b, 2 * 2 * ci, f, ht // 2, wd // 2) + B, C, T, H, W = xf.shape + xf = xf.view(B, h.shape[1], self.gs // 2, T, H, W).mean(dim=2) + + xn = x[:, :, 1:, :, :] + b, ci, frms, ht, wd = xn.shape + nf = frms // r1 + xn = xn.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2) + xn = xn.permute(0, 3, 5, 7, 1, 2, 4, 6) + xn = xn.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2) + B, C, T, H, W = xn.shape + xn = xn.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2) + sc = torch.cat([xf, xn], dim=2) + else: + b, c, frms, ht, wd = h.shape + nf = frms // r1 + h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2) + h = h.permute(0, 3, 5, 7, 1, 2, 4, 6) + h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2) + + b, ci, frms, ht, wd = x.shape + nf = frms // r1 + sc = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2) + sc = sc.permute(0, 3, 5, 7, 1, 2, 4, 6) + sc = sc.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2) + B, C, T, H, W = sc.shape + sc = sc.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2) + + return h + sc + + +class UpSmpl(nn.Module): + def __init__(self, ic, oc, tus=True): + super().__init__() + fct = 2 * 2 * 2 if tus else 1 * 2 * 2 + self.conv = VideoConv3d(ic, oc * fct, kernel_size=3) + + self.tus = tus + self.rp = fct * oc // ic + + def forward(self, x): + r1 = 2 if self.tus else 1 + h = self.conv(x) + + if self.tus: + hf = h[:, :, :1, :, :] + b, c, f, ht, wd = hf.shape + nc = c // (2 * 2) + hf = hf.reshape(b, 2, 2, nc, f, ht, wd) + hf = hf.permute(0, 3, 4, 5, 1, 6, 2) + hf = hf.reshape(b, nc, f, ht * 2, wd * 2) + hf = hf[:, : hf.shape[1] // 2] + + hn = h[:, :, 1:, :, :] + b, c, frms, ht, wd = hn.shape + nc = c // (r1 * 2 * 2) + hn = hn.reshape(b, r1, 2, 2, nc, frms, ht, wd) + hn = hn.permute(0, 4, 5, 1, 6, 2, 7, 3) + hn = hn.reshape(b, nc, frms * r1, ht * 2, wd * 2) + + h = torch.cat([hf, hn], dim=2) + + xf = x[:, :, :1, :, :] + b, ci, f, ht, wd = xf.shape + xf = xf.repeat_interleave(repeats=self.rp // 2, dim=1) + b, c, f, ht, wd = xf.shape + nc = c // (2 * 2) + xf = xf.reshape(b, 2, 2, nc, f, ht, wd) + xf = xf.permute(0, 3, 4, 5, 1, 6, 2) + xf = xf.reshape(b, nc, f, ht * 2, wd * 2) + + xn = x[:, :, 1:, :, :] + xn = xn.repeat_interleave(repeats=self.rp, dim=1) + b, c, frms, ht, wd = xn.shape + nc = c // (r1 * 2 * 2) + xn = xn.reshape(b, r1, 2, 2, nc, frms, ht, wd) + xn = xn.permute(0, 4, 5, 1, 6, 2, 7, 3) + xn = xn.reshape(b, nc, frms * r1, ht * 2, wd * 2) + sc = torch.cat([xf, xn], dim=2) + else: + b, c, frms, ht, wd = h.shape + nc = c // (r1 * 2 * 2) + h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd) + h = h.permute(0, 4, 5, 1, 6, 2, 7, 3) + h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2) + + sc = x.repeat_interleave(repeats=self.rp, dim=1) + b, c, frms, ht, wd = sc.shape + nc = c // (r1 * 2 * 2) + sc = sc.reshape(b, r1, 2, 2, nc, frms, ht, wd) + sc = sc.permute(0, 4, 5, 1, 6, 2, 7, 3) + sc = sc.reshape(b, nc, frms * r1, ht * 2, wd * 2) + + return h + sc + +class Encoder(nn.Module): + def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks, + ffactor_spatial, ffactor_temporal, downsample_match_channel=True, **_): + super().__init__() + self.z_channels = z_channels + self.block_out_channels = block_out_channels + self.num_res_blocks = num_res_blocks + self.conv_in = VideoConv3d(in_channels, block_out_channels[0], 3, 1, 1) + + self.down = nn.ModuleList() + ch = block_out_channels[0] + depth = (ffactor_spatial >> 1).bit_length() + depth_temporal = ((ffactor_spatial // ffactor_temporal) >> 1).bit_length() + + for i, tgt in enumerate(block_out_channels): + stage = nn.Module() + stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, + out_channels=tgt, + temb_channels=0, + conv_op=VideoConv3d, norm_op=RMS_norm) + for j in range(num_res_blocks)]) + ch = tgt + if i < depth: + nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and downsample_match_channel else ch + stage.downsample = DnSmpl(ch, nxt, tds=i >= depth_temporal) + ch = nxt + self.down.append(stage) + + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm) + self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=RMS_norm) + self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm) + + self.norm_out = RMS_norm(ch) + self.conv_out = VideoConv3d(ch, z_channels << 1, 3, 1, 1) + + self.regul = comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer() + + def forward(self, x): + x = x.unsqueeze(2) + x = self.conv_in(x) + + for stage in self.down: + for blk in stage.block: + x = blk(x) + if hasattr(stage, 'downsample'): + x = stage.downsample(x) + + x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x))) + + b, c, t, h, w = x.shape + grp = c // (self.z_channels << 1) + skip = x.view(b, c // grp, grp, t, h, w).mean(2) + + out = self.conv_out(F.silu(self.norm_out(x))) + skip + out = self.regul(out)[0] + + out = torch.cat((out[:, :, :1], out), dim=2) + out = out.permute(0, 2, 1, 3, 4) + b, f_times_2, c, h, w = out.shape + out = out.reshape(b, f_times_2 // 2, 2 * c, h, w) + out = out.permute(0, 2, 1, 3, 4).contiguous() + return out + +class Decoder(nn.Module): + def __init__(self, z_channels, out_channels, block_out_channels, num_res_blocks, + ffactor_spatial, ffactor_temporal, upsample_match_channel=True, **_): + super().__init__() + block_out_channels = block_out_channels[::-1] + self.z_channels = z_channels + self.block_out_channels = block_out_channels + self.num_res_blocks = num_res_blocks + + ch = block_out_channels[0] + self.conv_in = VideoConv3d(z_channels, ch, 3) + + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm) + self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=RMS_norm) + self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm) + + self.up = nn.ModuleList() + depth = (ffactor_spatial >> 1).bit_length() + depth_temporal = (ffactor_temporal >> 1).bit_length() + + for i, tgt in enumerate(block_out_channels): + stage = nn.Module() + stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, + out_channels=tgt, + temb_channels=0, + conv_op=VideoConv3d, norm_op=RMS_norm) + for j in range(num_res_blocks + 1)]) + ch = tgt + if i < depth: + nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and upsample_match_channel else ch + stage.upsample = UpSmpl(ch, nxt, tus=i < depth_temporal) + ch = nxt + self.up.append(stage) + + self.norm_out = RMS_norm(ch) + self.conv_out = VideoConv3d(ch, out_channels, 3) + + def forward(self, z): + z = z.permute(0, 2, 1, 3, 4) + b, f, c, h, w = z.shape + z = z.reshape(b, f, 2, c // 2, h, w) + z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w) + z = z.permute(0, 2, 1, 3, 4) + z = z[:, :, 1:] + + x = self.conv_in(z) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1) + x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x))) + + for stage in self.up: + for blk in stage.block: + x = blk(x) + if hasattr(stage, 'upsample'): + x = stage.upsample(x) + + return self.conv_out(F.silu(self.norm_out(x))) diff --git a/comfy/ldm/models/autoencoder.py b/comfy/ldm/models/autoencoder.py index 13bd6e16b..611d36a1b 100644 --- a/comfy/ldm/models/autoencoder.py +++ b/comfy/ldm/models/autoencoder.py @@ -26,6 +26,12 @@ class DiagonalGaussianRegularizer(torch.nn.Module): z = posterior.mode() return z, None +class EmptyRegularizer(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]: + return z, None class AbstractAutoencoder(torch.nn.Module): """ diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index 8f598a848..4245eedca 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -145,7 +145,7 @@ class Downsample(nn.Module): class ResnetBlock(nn.Module): def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False, - dropout=0.0, temb_channels=512, conv_op=ops.Conv2d): + dropout=0.0, temb_channels=512, conv_op=ops.Conv2d, norm_op=Normalize): super().__init__() self.in_channels = in_channels out_channels = in_channels if out_channels is None else out_channels @@ -153,7 +153,7 @@ class ResnetBlock(nn.Module): self.use_conv_shortcut = conv_shortcut self.swish = torch.nn.SiLU(inplace=True) - self.norm1 = Normalize(in_channels) + self.norm1 = norm_op(in_channels) self.conv1 = conv_op(in_channels, out_channels, kernel_size=3, @@ -162,7 +162,7 @@ class ResnetBlock(nn.Module): if temb_channels > 0: self.temb_proj = ops.Linear(temb_channels, out_channels) - self.norm2 = Normalize(out_channels) + self.norm2 = norm_op(out_channels) self.dropout = torch.nn.Dropout(dropout, inplace=True) self.conv2 = conv_op(out_channels, out_channels, @@ -305,11 +305,11 @@ def vae_attention(): return normal_attention class AttnBlock(nn.Module): - def __init__(self, in_channels, conv_op=ops.Conv2d): + def __init__(self, in_channels, conv_op=ops.Conv2d, norm_op=Normalize): super().__init__() self.in_channels = in_channels - self.norm = Normalize(in_channels) + self.norm = norm_op(in_channels) self.q = conv_op(in_channels, in_channels, kernel_size=1, diff --git a/comfy/model_base.py b/comfy/model_base.py index 993ff65e6..c69a9d1ad 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1432,3 +1432,23 @@ class HunyuanImage21(BaseModel): out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance])) return out + +class HunyuanImage21Refiner(HunyuanImage21): + def concat_cond(self, **kwargs): + noise = kwargs.get("noise", None) + image = kwargs.get("concat_latent_image", None) + device = kwargs["device"] + + if image is None: + shape_image = list(noise.shape) + image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device) + else: + image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") + image = self.process_latent_in(image) + image = utils.resize_to_batch_size(image, noise.shape[0]) + return image + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + out['disable_time_r'] = comfy.conds.CONDConstant(True) + return out diff --git a/comfy/sd.py b/comfy/sd.py index 9dd9a74d4..02ddc7239 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -285,6 +285,7 @@ class VAE: self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0) self.working_dtypes = [torch.bfloat16, torch.float32] self.disable_offload = False + self.not_video = False self.downscale_index_formula = None self.upscale_index_formula = None @@ -409,6 +410,20 @@ class VAE: self.downscale_ratio = (lambda a: max(0, math.floor((a + 7) / 8)), 32, 32) self.downscale_index_formula = (8, 32, 32) self.working_dtypes = [torch.bfloat16, torch.float32] + elif "decoder.conv_in.conv.weight" in sd and sd['decoder.conv_in.conv.weight'].shape[1] == 32: + ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True} + self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1] + self.downscale_ratio = 16 + self.upscale_ratio = 16 + self.latent_dim = 3 + self.not_video = True + self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] + self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.EmptyRegularizer"}, + encoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Encoder", 'params': ddconfig}, + decoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Decoder", 'params': ddconfig}) + + self.memory_used_encode = lambda shape, dtype: (1400 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (1400 * shape[-3] * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype) elif "decoder.conv_in.conv.weight" in sd: ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0} ddconfig["conv3d"] = True @@ -669,7 +684,7 @@ class VAE: self.throw_exception_if_invalid() pixel_samples = self.vae_encode_crop_pixels(pixel_samples) pixel_samples = pixel_samples.movedim(-1, 1) - if self.latent_dim == 3 and pixel_samples.ndim < 5: + if not self.not_video and self.latent_dim == 3 and pixel_samples.ndim < 5: pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) try: memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index aa953b462..ba1b8c313 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1321,6 +1321,23 @@ class HunyuanImage21(HunyuanVideo): hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] +class HunyuanImage21Refiner(HunyuanVideo): + unet_config = { + "image_model": "hunyuan_video", + "patch_size": [1, 1, 1], + "vec_in_dim": None, + } + + sampling_settings = { + "shift": 1.0, + } + + latent_format = latent_formats.HunyuanImage21Refiner + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.HunyuanImage21Refiner(self, device=device) + return out + +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py index ce031ceb2..351a7e2cb 100644 --- a/comfy_extras/nodes_hunyuan.py +++ b/comfy_extras/nodes_hunyuan.py @@ -128,6 +128,28 @@ class EmptyHunyuanImageLatent: latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device()) return ({"samples":latent}, ) +class HunyuanRefinerLatent: + @classmethod + def INPUT_TYPES(s): + return {"required": {"positive": ("CONDITIONING", ), + "negative": ("CONDITIONING", ), + "latent": ("LATENT", ), + }} + + RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") + RETURN_NAMES = ("positive", "negative", "latent") + + FUNCTION = "execute" + + def execute(self, positive, negative, latent): + latent = latent["samples"] + + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent}) + out_latent = {} + out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device()) + return (positive, negative, out_latent) + NODE_CLASS_MAPPINGS = { "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT, @@ -135,4 +157,5 @@ NODE_CLASS_MAPPINGS = { "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo, "HunyuanImageToVideo": HunyuanImageToVideo, "EmptyHunyuanImageLatent": EmptyHunyuanImageLatent, + "HunyuanRefinerLatent": HunyuanRefinerLatent, } From 15ec9ea958d1c5d374add598b571a585541d4863 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Thu, 11 Sep 2025 21:44:20 -0700 Subject: [PATCH 267/759] Add Output to V3 Combo type to match what is possible with V1 (#9813) --- comfy_api/latest/_io.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index f770109d5..4826818df 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -331,7 +331,7 @@ class String(ComfyTypeIO): }) @comfytype(io_type="COMBO") -class Combo(ComfyTypeI): +class Combo(ComfyTypeIO): Type = str class Input(WidgetInput): """Combo input (dropdown).""" @@ -360,6 +360,14 @@ class Combo(ComfyTypeI): "remote": self.remote.as_dict() if self.remote else None, }) + class Output(Output): + def __init__(self, id: str=None, display_name: str=None, options: list[str]=None, tooltip: str=None, is_output_list=False): + super().__init__(id, display_name, tooltip, is_output_list) + self.options = options if options is not None else [] + + @property + def io_type(self): + return self.options @comfytype(io_type="COMBO") class MultiCombo(ComfyTypeI): From d6b977b2e680e98ad18a37ee13783da4f30e15f4 Mon Sep 17 00:00:00 2001 From: Benjamin Lu Date: Thu, 11 Sep 2025 21:46:01 -0700 Subject: [PATCH 268/759] Bump frontend to 1.26.11 (#9809) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0e21967ef..de5af5fac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.25.11 +comfyui-frontend-package==1.26.11 comfyui-workflow-templates==0.1.81 comfyui-embedded-docs==0.2.6 torch From fd2b820ec28e9575877dc6c51949b2d28dc78894 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 12 Sep 2025 13:03:08 -0700 Subject: [PATCH 269/759] Add noise augmentation to hunyuan image refiner. (#9831) This was missing and should help with colors being blown out. --- comfy/model_base.py | 4 ++++ comfy_extras/nodes_hunyuan.py | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index c69a9d1ad..8422051bf 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1437,6 +1437,7 @@ class HunyuanImage21Refiner(HunyuanImage21): def concat_cond(self, **kwargs): noise = kwargs.get("noise", None) image = kwargs.get("concat_latent_image", None) + noise_augmentation = kwargs.get("noise_augmentation", 0.0) device = kwargs["device"] if image is None: @@ -1446,6 +1447,9 @@ class HunyuanImage21Refiner(HunyuanImage21): image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") image = self.process_latent_in(image) image = utils.resize_to_batch_size(image, noise.shape[0]) + if noise_augmentation > 0: + noise = torch.randn(image.shape, generator=torch.manual_seed(kwargs.get("seed", 0) - 10), dtype=image.dtype, device="cpu").to(image.device) + image = noise_augmentation * noise + (1.0 - noise_augmentation) * image return image def extra_conds(self, **kwargs): diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py index 351a7e2cb..db398cdf1 100644 --- a/comfy_extras/nodes_hunyuan.py +++ b/comfy_extras/nodes_hunyuan.py @@ -134,6 +134,7 @@ class HunyuanRefinerLatent: return {"required": {"positive": ("CONDITIONING", ), "negative": ("CONDITIONING", ), "latent": ("LATENT", ), + "noise_augmentation": ("FLOAT", {"default": 0.10, "min": 0.0, "max": 1.0, "step": 0.01}), }} RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") @@ -141,11 +142,10 @@ class HunyuanRefinerLatent: FUNCTION = "execute" - def execute(self, positive, negative, latent): + def execute(self, positive, negative, latent, noise_augmentation): latent = latent["samples"] - - positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent}) - negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent}) + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation}) out_latent = {} out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device()) return (positive, negative, out_latent) From e600520f8aa583c79caa286a8d7d584edc3d059b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 12 Sep 2025 13:35:34 -0700 Subject: [PATCH 270/759] Fix hunyuan refiner blownout colors at noise aug less than 0.25 (#9832) --- comfy/model_base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 8422051bf..4176bca25 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1449,7 +1449,9 @@ class HunyuanImage21Refiner(HunyuanImage21): image = utils.resize_to_batch_size(image, noise.shape[0]) if noise_augmentation > 0: noise = torch.randn(image.shape, generator=torch.manual_seed(kwargs.get("seed", 0) - 10), dtype=image.dtype, device="cpu").to(image.device) - image = noise_augmentation * noise + (1.0 - noise_augmentation) * image + image = noise_augmentation * noise + min(1.0 - noise_augmentation, 0.75) * image + else: + image = 0.75 * image return image def extra_conds(self, **kwargs): From 7757d5a657cbe9b22d1f3538ee0bc5387d3f5459 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 12 Sep 2025 13:40:12 -0700 Subject: [PATCH 271/759] Set default hunyuan refiner shift to 4.0 (#9833) --- comfy/supported_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index ba1b8c313..472ea0ae9 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1329,7 +1329,7 @@ class HunyuanImage21Refiner(HunyuanVideo): } sampling_settings = { - "shift": 1.0, + "shift": 4.0, } latent_format = latent_formats.HunyuanImage21Refiner From 0aa074a420c450fd7793d83c6f8d66009a1ca2a2 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 13 Sep 2025 00:29:03 +0300 Subject: [PATCH 272/759] add kling-v2-1 model to the KlingStartEndFrame node (#9630) --- comfy_api_nodes/nodes_kling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 9fa390985..5f55b2cc9 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -846,6 +846,8 @@ class KlingStartEndFrameNode(KlingImage2VideoNode): "pro mode / 10s duration / kling-v1-5": ("pro", "10", "kling-v1-5"), "pro mode / 5s duration / kling-v1-6": ("pro", "5", "kling-v1-6"), "pro mode / 10s duration / kling-v1-6": ("pro", "10", "kling-v1-6"), + "pro mode / 5s duration / kling-v2-1": ("pro", "5", "kling-v2-1"), + "pro mode / 10s duration / kling-v2-1": ("pro", "10", "kling-v2-1"), } @classmethod From 45bc1f5c00307f3e85871ecfb46acaa2365b0096 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 13 Sep 2025 00:37:31 +0300 Subject: [PATCH 273/759] convert Minimax API nodes to the V3 schema (#9693) --- comfy_api_nodes/nodes_minimax.py | 732 ++++++++++++++++--------------- 1 file changed, 378 insertions(+), 354 deletions(-) diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index bb3c9e710..bf560661c 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -1,9 +1,10 @@ from inspect import cleandoc -from typing import Union +from typing import Optional import logging import torch -from comfy.comfy_types.node_typing import IO +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io as comfy_io from comfy_api.input_impl.video_types import VideoFromFile from comfy_api_nodes.apis import ( MinimaxVideoGenerationRequest, @@ -11,7 +12,7 @@ from comfy_api_nodes.apis import ( MinimaxFileRetrieveResponse, MinimaxTaskResultResponse, SubjectReferenceItem, - MiniMaxModel + MiniMaxModel, ) from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -31,372 +32,398 @@ from server import PromptServer I2V_AVERAGE_DURATION = 114 T2V_AVERAGE_DURATION = 234 -class MinimaxTextToVideoNode: + +async def _generate_mm_video( + *, + auth: dict[str, str], + node_id: str, + prompt_text: str, + seed: int, + model: str, + image: Optional[torch.Tensor] = None, # used for ImageToVideo + subject: Optional[torch.Tensor] = None, # used for SubjectToVideo + average_duration: Optional[int] = None, +) -> comfy_io.NodeOutput: + if image is None: + validate_string(prompt_text, field_name="prompt_text") + # upload image, if passed in + image_url = None + if image is not None: + image_url = (await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=auth))[0] + + # TODO: figure out how to deal with subject properly, API returns invalid params when using S2V-01 model + subject_reference = None + if subject is not None: + subject_url = (await upload_images_to_comfyapi(subject, max_images=1, auth_kwargs=auth))[0] + subject_reference = [SubjectReferenceItem(image=subject_url)] + + + video_generate_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/minimax/video_generation", + method=HttpMethod.POST, + request_model=MinimaxVideoGenerationRequest, + response_model=MinimaxVideoGenerationResponse, + ), + request=MinimaxVideoGenerationRequest( + model=MiniMaxModel(model), + prompt=prompt_text, + callback_url=None, + first_frame_image=image_url, + subject_reference=subject_reference, + prompt_optimizer=None, + ), + auth_kwargs=auth, + ) + response = await video_generate_operation.execute() + + task_id = response.task_id + if not task_id: + raise Exception(f"MiniMax generation failed: {response.base_resp}") + + video_generate_operation = PollingOperation( + poll_endpoint=ApiEndpoint( + path="/proxy/minimax/query/video_generation", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MinimaxTaskResultResponse, + query_params={"task_id": task_id}, + ), + completed_statuses=["Success"], + failed_statuses=["Fail"], + status_extractor=lambda x: x.status.value, + estimated_duration=average_duration, + node_id=node_id, + auth_kwargs=auth, + ) + task_result = await video_generate_operation.execute() + + file_id = task_result.file_id + if file_id is None: + raise Exception("Request was not successful. Missing file ID.") + file_retrieve_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/minimax/files/retrieve", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MinimaxFileRetrieveResponse, + query_params={"file_id": int(file_id)}, + ), + request=EmptyRequest(), + auth_kwargs=auth, + ) + file_result = await file_retrieve_operation.execute() + + file_url = file_result.file.download_url + if file_url is None: + raise Exception( + f"No video was found in the response. Full response: {file_result.model_dump()}" + ) + logging.info("Generated video URL: %s", file_url) + if node_id: + if hasattr(file_result.file, "backup_download_url"): + message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}" + else: + message = f"Result URL: {file_url}" + PromptServer.instance.send_progress_text(message, node_id) + + # Download and return as VideoFromFile + video_io = await download_url_to_bytesio(file_url) + if video_io is None: + error_msg = f"Failed to download video from {file_url}" + logging.error(error_msg) + raise Exception(error_msg) + return comfy_io.NodeOutput(VideoFromFile(video_io)) + + +class MinimaxTextToVideoNode(comfy_io.ComfyNode): """ Generates videos synchronously based on a prompt, and optional parameters using MiniMax's API. """ - AVERAGE_DURATION = T2V_AVERAGE_DURATION + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="MinimaxTextToVideoNode", + display_name="MiniMax Text to Video", + category="api node/video/MiniMax", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt_text", + multiline=True, + default="", + tooltip="Text prompt to guide the video generation", + ), + comfy_io.Combo.Input( + "model", + options=["T2V-01", "T2V-01-Director"], + default="T2V-01", + tooltip="Model to use for video generation", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + step=1, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + optional=True, + ), + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt_text": ( - "STRING", - { - "multiline": True, - "default": "", - "tooltip": "Text prompt to guide the video generation", - }, - ), - "model": ( - [ - "T2V-01", - "T2V-01-Director", - ], - { - "default": "T2V-01", - "tooltip": "Model to use for video generation", - }, - ), + async def execute( + cls, + prompt_text: str, + model: str = "T2V-01", + seed: int = 0, + ) -> comfy_io.NodeOutput: + return await _generate_mm_video( + auth={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, }, - "optional": { - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = ("VIDEO",) - DESCRIPTION = "Generates videos from prompts using MiniMax's API" - FUNCTION = "generate_video" - CATEGORY = "api node/video/MiniMax" - API_NODE = True - - async def generate_video( - self, - prompt_text, - seed=0, - model="T2V-01", - image: torch.Tensor=None, # used for ImageToVideo - subject: torch.Tensor=None, # used for SubjectToVideo - unique_id: Union[str, None]=None, - **kwargs, - ): - ''' - Function used between MiniMax nodes - supports T2V, I2V, and S2V, based on provided arguments. - ''' - if image is None: - validate_string(prompt_text, field_name="prompt_text") - # upload image, if passed in - image_url = None - if image is not None: - image_url = (await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs))[0] - - # TODO: figure out how to deal with subject properly, API returns invalid params when using S2V-01 model - subject_reference = None - if subject is not None: - subject_url = (await upload_images_to_comfyapi(subject, max_images=1, auth_kwargs=kwargs))[0] - subject_reference = [SubjectReferenceItem(image=subject_url)] - - - video_generate_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/minimax/video_generation", - method=HttpMethod.POST, - request_model=MinimaxVideoGenerationRequest, - response_model=MinimaxVideoGenerationResponse, - ), - request=MinimaxVideoGenerationRequest( - model=MiniMaxModel(model), - prompt=prompt_text, - callback_url=None, - first_frame_image=image_url, - subject_reference=subject_reference, - prompt_optimizer=None, - ), - auth_kwargs=kwargs, + node_id=cls.hidden.unique_id, + prompt_text=prompt_text, + seed=seed, + model=model, + image=None, + subject=None, + average_duration=T2V_AVERAGE_DURATION, ) - response = await video_generate_operation.execute() - - task_id = response.task_id - if not task_id: - raise Exception(f"MiniMax generation failed: {response.base_resp}") - - video_generate_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path="/proxy/minimax/query/video_generation", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MinimaxTaskResultResponse, - query_params={"task_id": task_id}, - ), - completed_statuses=["Success"], - failed_statuses=["Fail"], - status_extractor=lambda x: x.status.value, - estimated_duration=self.AVERAGE_DURATION, - node_id=unique_id, - auth_kwargs=kwargs, - ) - task_result = await video_generate_operation.execute() - - file_id = task_result.file_id - if file_id is None: - raise Exception("Request was not successful. Missing file ID.") - file_retrieve_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/minimax/files/retrieve", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MinimaxFileRetrieveResponse, - query_params={"file_id": int(file_id)}, - ), - request=EmptyRequest(), - auth_kwargs=kwargs, - ) - file_result = await file_retrieve_operation.execute() - - file_url = file_result.file.download_url - if file_url is None: - raise Exception( - f"No video was found in the response. Full response: {file_result.model_dump()}" - ) - logging.info(f"Generated video URL: {file_url}") - if unique_id: - if hasattr(file_result.file, "backup_download_url"): - message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}" - else: - message = f"Result URL: {file_url}" - PromptServer.instance.send_progress_text(message, unique_id) - - video_io = await download_url_to_bytesio(file_url) - if video_io is None: - error_msg = f"Failed to download video from {file_url}" - logging.error(error_msg) - raise Exception(error_msg) - return (VideoFromFile(video_io),) -class MinimaxImageToVideoNode(MinimaxTextToVideoNode): +class MinimaxImageToVideoNode(comfy_io.ComfyNode): """ Generates videos synchronously based on an image and prompt, and optional parameters using MiniMax's API. """ - AVERAGE_DURATION = I2V_AVERAGE_DURATION + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="MinimaxImageToVideoNode", + display_name="MiniMax Image to Video", + category="api node/video/MiniMax", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input( + "image", + tooltip="Image to use as first frame of video generation", + ), + comfy_io.String.Input( + "prompt_text", + multiline=True, + default="", + tooltip="Text prompt to guide the video generation", + ), + comfy_io.Combo.Input( + "model", + options=["I2V-01-Director", "I2V-01", "I2V-01-live"], + default="I2V-01", + tooltip="Model to use for video generation", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + step=1, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + optional=True, + ), + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ( - IO.IMAGE, - { - "tooltip": "Image to use as first frame of video generation" - }, - ), - "prompt_text": ( - "STRING", - { - "multiline": True, - "default": "", - "tooltip": "Text prompt to guide the video generation", - }, - ), - "model": ( - [ - "I2V-01-Director", - "I2V-01", - "I2V-01-live", - ], - { - "default": "I2V-01", - "tooltip": "Model to use for video generation", - }, - ), + async def execute( + cls, + image: torch.Tensor, + prompt_text: str, + model: str = "I2V-01", + seed: int = 0, + ) -> comfy_io.NodeOutput: + return await _generate_mm_video( + auth={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, }, - "optional": { - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = ("VIDEO",) - DESCRIPTION = "Generates videos from an image and prompts using MiniMax's API" - FUNCTION = "generate_video" - CATEGORY = "api node/video/MiniMax" - API_NODE = True + node_id=cls.hidden.unique_id, + prompt_text=prompt_text, + seed=seed, + model=model, + image=image, + subject=None, + average_duration=I2V_AVERAGE_DURATION, + ) -class MinimaxSubjectToVideoNode(MinimaxTextToVideoNode): +class MinimaxSubjectToVideoNode(comfy_io.ComfyNode): """ Generates videos synchronously based on an image and prompt, and optional parameters using MiniMax's API. """ - AVERAGE_DURATION = T2V_AVERAGE_DURATION + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="MinimaxSubjectToVideoNode", + display_name="MiniMax Subject to Video", + category="api node/video/MiniMax", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input( + "subject", + tooltip="Image of subject to reference for video generation", + ), + comfy_io.String.Input( + "prompt_text", + multiline=True, + default="", + tooltip="Text prompt to guide the video generation", + ), + comfy_io.Combo.Input( + "model", + options=["S2V-01"], + default="S2V-01", + tooltip="Model to use for video generation", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + step=1, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + optional=True, + ), + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "subject": ( - IO.IMAGE, - { - "tooltip": "Image of subject to reference video generation" - }, - ), - "prompt_text": ( - "STRING", - { - "multiline": True, - "default": "", - "tooltip": "Text prompt to guide the video generation", - }, - ), - "model": ( - [ - "S2V-01", - ], - { - "default": "S2V-01", - "tooltip": "Model to use for video generation", - }, - ), + async def execute( + cls, + subject: torch.Tensor, + prompt_text: str, + model: str = "S2V-01", + seed: int = 0, + ) -> comfy_io.NodeOutput: + return await _generate_mm_video( + auth={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, }, - "optional": { - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = ("VIDEO",) - DESCRIPTION = "Generates videos from an image and prompts using MiniMax's API" - FUNCTION = "generate_video" - CATEGORY = "api node/video/MiniMax" - API_NODE = True + node_id=cls.hidden.unique_id, + prompt_text=prompt_text, + seed=seed, + model=model, + image=None, + subject=subject, + average_duration=T2V_AVERAGE_DURATION, + ) -class MinimaxHailuoVideoNode: +class MinimaxHailuoVideoNode(comfy_io.ComfyNode): """Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt_text": ( - "STRING", - { - "multiline": True, - "default": "", - "tooltip": "Text prompt to guide the video generation.", - }, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="MinimaxHailuoVideoNode", + display_name="MiniMax Hailuo Video", + category="api node/video/MiniMax", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt_text", + multiline=True, + default="", + tooltip="Text prompt to guide the video generation.", ), - }, - "optional": { - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + step=1, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + optional=True, ), - "first_frame_image": ( - IO.IMAGE, - { - "tooltip": "Optional image to use as the first frame to generate a video." - }, + comfy_io.Image.Input( + "first_frame_image", + tooltip="Optional image to use as the first frame to generate a video.", + optional=True, ), - "prompt_optimizer": ( - IO.BOOLEAN, - { - "tooltip": "Optimize prompt to improve generation quality when needed.", - "default": True, - }, + comfy_io.Boolean.Input( + "prompt_optimizer", + default=True, + tooltip="Optimize prompt to improve generation quality when needed.", + optional=True, ), - "duration": ( - IO.COMBO, - { - "tooltip": "The length of the output video in seconds.", - "default": 6, - "options": [6, 10], - }, + comfy_io.Combo.Input( + "duration", + options=[6, 10], + default=6, + tooltip="The length of the output video in seconds.", + optional=True, ), - "resolution": ( - IO.COMBO, - { - "tooltip": "The dimensions of the video display. " - "1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels.", - "default": "768P", - "options": ["768P", "1080P"], - }, + comfy_io.Combo.Input( + "resolution", + options=["768P", "1080P"], + default="768P", + tooltip="The dimensions of the video display. 1080p is 1920x1080, 768p is 1366x768.", + optional=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + prompt_text: str, + seed: int = 0, + first_frame_image: Optional[torch.Tensor] = None, # used for ImageToVideo + prompt_optimizer: bool = True, + duration: int = 6, + resolution: str = "768P", + model: str = "MiniMax-Hailuo-02", + ) -> comfy_io.NodeOutput: + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, } - - RETURN_TYPES = ("VIDEO",) - DESCRIPTION = cleandoc(__doc__ or "") - FUNCTION = "generate_video" - CATEGORY = "api node/video/MiniMax" - API_NODE = True - - async def generate_video( - self, - prompt_text, - seed=0, - first_frame_image: torch.Tensor=None, # used for ImageToVideo - prompt_optimizer=True, - duration=6, - resolution="768P", - model="MiniMax-Hailuo-02", - unique_id: Union[str, None]=None, - **kwargs, - ): if first_frame_image is None: validate_string(prompt_text, field_name="prompt_text") @@ -408,7 +435,7 @@ class MinimaxHailuoVideoNode: # upload image, if passed in image_url = None if first_frame_image is not None: - image_url = (await upload_images_to_comfyapi(first_frame_image, max_images=1, auth_kwargs=kwargs))[0] + image_url = (await upload_images_to_comfyapi(first_frame_image, max_images=1, auth_kwargs=auth))[0] video_generate_operation = SynchronousOperation( endpoint=ApiEndpoint( @@ -426,7 +453,7 @@ class MinimaxHailuoVideoNode: duration=duration, resolution=resolution, ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) response = await video_generate_operation.execute() @@ -447,8 +474,8 @@ class MinimaxHailuoVideoNode: failed_statuses=["Fail"], status_extractor=lambda x: x.status.value, estimated_duration=average_duration, - node_id=unique_id, - auth_kwargs=kwargs, + node_id=cls.hidden.unique_id, + auth_kwargs=auth, ) task_result = await video_generate_operation.execute() @@ -464,7 +491,7 @@ class MinimaxHailuoVideoNode: query_params={"file_id": int(file_id)}, ), request=EmptyRequest(), - auth_kwargs=kwargs, + auth_kwargs=auth, ) file_result = await file_retrieve_operation.execute() @@ -474,34 +501,31 @@ class MinimaxHailuoVideoNode: f"No video was found in the response. Full response: {file_result.model_dump()}" ) logging.info(f"Generated video URL: {file_url}") - if unique_id: + if cls.hidden.unique_id: if hasattr(file_result.file, "backup_download_url"): message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}" else: message = f"Result URL: {file_url}" - PromptServer.instance.send_progress_text(message, unique_id) + PromptServer.instance.send_progress_text(message, cls.hidden.unique_id) video_io = await download_url_to_bytesio(file_url) if video_io is None: error_msg = f"Failed to download video from {file_url}" logging.error(error_msg) raise Exception(error_msg) - return (VideoFromFile(video_io),) + return comfy_io.NodeOutput(VideoFromFile(video_io)) -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "MinimaxTextToVideoNode": MinimaxTextToVideoNode, - "MinimaxImageToVideoNode": MinimaxImageToVideoNode, - # "MinimaxSubjectToVideoNode": MinimaxSubjectToVideoNode, - "MinimaxHailuoVideoNode": MinimaxHailuoVideoNode, -} +class MinimaxExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + MinimaxTextToVideoNode, + MinimaxImageToVideoNode, + # MinimaxSubjectToVideoNode, + MinimaxHailuoVideoNode, + ] -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "MinimaxTextToVideoNode": "MiniMax Text to Video", - "MinimaxImageToVideoNode": "MiniMax Image to Video", - "MinimaxSubjectToVideoNode": "MiniMax Subject to Video", - "MinimaxHailuoVideoNode": "MiniMax Hailuo Video", -} + +async def comfy_entrypoint() -> MinimaxExtension: + return MinimaxExtension() From f9d2e4b742af9aea3c9ffa822397c1b86cec9304 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 13 Sep 2025 00:38:12 +0300 Subject: [PATCH 274/759] convert WanCameraEmbedding node to V3 schema (#9714) --- comfy_extras/nodes_camera_trajectory.py | 81 ++++++++++++++++--------- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/comfy_extras/nodes_camera_trajectory.py b/comfy_extras/nodes_camera_trajectory.py index 5e0e39f91..eb7ef363c 100644 --- a/comfy_extras/nodes_camera_trajectory.py +++ b/comfy_extras/nodes_camera_trajectory.py @@ -2,12 +2,12 @@ import nodes import torch import numpy as np from einops import rearrange +from typing_extensions import override import comfy.model_management +from comfy_api.latest import ComfyExtension, io -MAX_RESOLUTION = nodes.MAX_RESOLUTION - CAMERA_DICT = { "base_T_norm": 1.5, "base_angle": np.pi/3, @@ -148,32 +148,47 @@ def get_camera_motion(angle, T, speed, n=81): RT = np.stack(RT) return RT -class WanCameraEmbedding: +class WanCameraEmbedding(io.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "camera_pose":(["Static","Pan Up","Pan Down","Pan Left","Pan Right","Zoom In","Zoom Out","Anti Clockwise (ACW)", "ClockWise (CW)"],{"default":"Static"}), - "width": ("INT", {"default": 832, "min": 16, "max": MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 81, "min": 1, "max": MAX_RESOLUTION, "step": 4}), - }, - "optional":{ - "speed":("FLOAT",{"default":1.0, "min": 0, "max": 10.0, "step": 0.1}), - "fx":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.000000001}), - "fy":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.000000001}), - "cx":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.01}), - "cy":("FLOAT",{"default":0.5, "min": 0, "max": 1, "step": 0.01}), - } + def define_schema(cls): + return io.Schema( + node_id="WanCameraEmbedding", + category="camera", + inputs=[ + io.Combo.Input( + "camera_pose", + options=[ + "Static", + "Pan Up", + "Pan Down", + "Pan Left", + "Pan Right", + "Zoom In", + "Zoom Out", + "Anti Clockwise (ACW)", + "ClockWise (CW)", + ], + default="Static", + ), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Float.Input("speed", default=1.0, min=0, max=10.0, step=0.1, optional=True), + io.Float.Input("fx", default=0.5, min=0, max=1, step=0.000000001, optional=True), + io.Float.Input("fy", default=0.5, min=0, max=1, step=0.000000001, optional=True), + io.Float.Input("cx", default=0.5, min=0, max=1, step=0.01, optional=True), + io.Float.Input("cy", default=0.5, min=0, max=1, step=0.01, optional=True), + ], + outputs=[ + io.WanCameraEmbedding.Output(display_name="camera_embedding"), + io.Int.Output(display_name="width"), + io.Int.Output(display_name="height"), + io.Int.Output(display_name="length"), + ], + ) - } - - RETURN_TYPES = ("WAN_CAMERA_EMBEDDING","INT","INT","INT") - RETURN_NAMES = ("camera_embedding","width","height","length") - FUNCTION = "run" - CATEGORY = "camera" - - def run(self, camera_pose, width, height, length, speed=1.0, fx=0.5, fy=0.5, cx=0.5, cy=0.5): + @classmethod + def execute(cls, camera_pose, width, height, length, speed=1.0, fx=0.5, fy=0.5, cx=0.5, cy=0.5) -> io.NodeOutput: """ Use Camera trajectory as extrinsic parameters to calculate Plücker embeddings (Sitzmannet al., 2021) Adapted from https://github.com/aigc-apps/VideoX-Fun/blob/main/comfyui/comfyui_nodes.py @@ -210,9 +225,15 @@ class WanCameraEmbedding: control_camera_video = control_camera_video.contiguous().view(b, f // 4, 4, c, h, w).transpose(2, 3) control_camera_video = control_camera_video.contiguous().view(b, f // 4, c * 4, h, w).transpose(1, 2) - return (control_camera_video, width, height, length) + return io.NodeOutput(control_camera_video, width, height, length) -NODE_CLASS_MAPPINGS = { - "WanCameraEmbedding": WanCameraEmbedding, -} +class CameraTrajectoryExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + WanCameraEmbedding, + ] + +async def comfy_entrypoint() -> CameraTrajectoryExtension: + return CameraTrajectoryExtension() From dcb883498337bcb2758fa9e7b326ea3b63c6b8d4 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 13 Sep 2025 00:38:46 +0300 Subject: [PATCH 275/759] convert Cosmos nodes to V3 schema (#9721) --- comfy_extras/nodes_cosmos.py | 129 +++++++++++++++++++---------------- 1 file changed, 72 insertions(+), 57 deletions(-) diff --git a/comfy_extras/nodes_cosmos.py b/comfy_extras/nodes_cosmos.py index 4f4960551..7dd129d19 100644 --- a/comfy_extras/nodes_cosmos.py +++ b/comfy_extras/nodes_cosmos.py @@ -1,25 +1,32 @@ +from typing_extensions import override import nodes import torch import comfy.model_management import comfy.utils import comfy.latent_formats +from comfy_api.latest import ComfyExtension, io -class EmptyCosmosLatentVideo: + +class EmptyCosmosLatentVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "width": ("INT", {"default": 1280, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 704, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 121, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 8}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="EmptyCosmosLatentVideo", + category="latent/video", + inputs=[ + io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=121, min=1, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[io.Latent.Output()], + ) - CATEGORY = "latent/video" - - def generate(self, width, height, length, batch_size=1): + @classmethod + def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 8) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - return ({"samples": latent}, ) + return io.NodeOutput({"samples": latent}) def vae_encode_with_padding(vae, image, width, height, length, padding=0): @@ -33,31 +40,31 @@ def vae_encode_with_padding(vae, image, width, height, length, padding=0): return latent_temp[:, :, :latent_len] -class CosmosImageToVideoLatent: +class CosmosImageToVideoLatent(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"vae": ("VAE", ), - "width": ("INT", {"default": 1280, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 704, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 121, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 8}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"start_image": ("IMAGE", ), - "end_image": ("IMAGE", ), - }} + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="CosmosImageToVideoLatent", + category="conditioning/inpaint", + inputs=[ + io.Vae.Input("vae"), + io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=121, min=1, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Image.Input("start_image", optional=True), + io.Image.Input("end_image", optional=True), + ], + outputs=[io.Latent.Output()], + ) - - RETURN_TYPES = ("LATENT",) - FUNCTION = "encode" - - CATEGORY = "conditioning/inpaint" - - def encode(self, vae, width, height, length, batch_size, start_image=None, end_image=None): + @classmethod + def execute(cls, vae, width, height, length, batch_size, start_image=None, end_image=None) -> io.NodeOutput: latent = torch.zeros([1, 16, ((length - 1) // 8) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) if start_image is None and end_image is None: out_latent = {} out_latent["samples"] = latent - return (out_latent,) + return io.NodeOutput(out_latent) mask = torch.ones([latent.shape[0], 1, ((length - 1) // 8) + 1, latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device()) @@ -74,33 +81,33 @@ class CosmosImageToVideoLatent: out_latent = {} out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1)) out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1)) - return (out_latent,) + return io.NodeOutput(out_latent) -class CosmosPredict2ImageToVideoLatent: +class CosmosPredict2ImageToVideoLatent(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"vae": ("VAE", ), - "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 93, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }, - "optional": {"start_image": ("IMAGE", ), - "end_image": ("IMAGE", ), - }} + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="CosmosPredict2ImageToVideoLatent", + category="conditioning/inpaint", + inputs=[ + io.Vae.Input("vae"), + io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=93, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Image.Input("start_image", optional=True), + io.Image.Input("end_image", optional=True), + ], + outputs=[io.Latent.Output()], + ) - - RETURN_TYPES = ("LATENT",) - FUNCTION = "encode" - - CATEGORY = "conditioning/inpaint" - - def encode(self, vae, width, height, length, batch_size, start_image=None, end_image=None): + @classmethod + def execute(cls, vae, width, height, length, batch_size, start_image=None, end_image=None) -> io.NodeOutput: latent = torch.zeros([1, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) if start_image is None and end_image is None: out_latent = {} out_latent["samples"] = latent - return (out_latent,) + return io.NodeOutput(out_latent) mask = torch.ones([latent.shape[0], 1, ((length - 1) // 4) + 1, latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device()) @@ -119,10 +126,18 @@ class CosmosPredict2ImageToVideoLatent: latent = latent_format.process_out(latent) * mask + latent * (1.0 - mask) out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1)) out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1)) - return (out_latent,) + return io.NodeOutput(out_latent) -NODE_CLASS_MAPPINGS = { - "EmptyCosmosLatentVideo": EmptyCosmosLatentVideo, - "CosmosImageToVideoLatent": CosmosImageToVideoLatent, - "CosmosPredict2ImageToVideoLatent": CosmosPredict2ImageToVideoLatent, -} + +class CosmosExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + EmptyCosmosLatentVideo, + CosmosImageToVideoLatent, + CosmosPredict2ImageToVideoLatent, + ] + + +async def comfy_entrypoint() -> CosmosExtension: + return CosmosExtension() From ba68e83f1c103eb4cb57fe01328706a0574fff3c Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 13 Sep 2025 00:39:30 +0300 Subject: [PATCH 276/759] convert nodes_cond.py to V3 schema (#9719) --- comfy_extras/nodes_cond.py | 75 ++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/comfy_extras/nodes_cond.py b/comfy_extras/nodes_cond.py index 58c16f621..8b06e3de9 100644 --- a/comfy_extras/nodes_cond.py +++ b/comfy_extras/nodes_cond.py @@ -1,15 +1,25 @@ +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io -class CLIPTextEncodeControlnet: +class CLIPTextEncodeControlnet(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"clip": ("CLIP", ), "conditioning": ("CONDITIONING", ), "text": ("STRING", {"multiline": True, "dynamicPrompts": True})}} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="CLIPTextEncodeControlnet", + category="_for_testing/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.Conditioning.Input("conditioning"), + io.String.Input("text", multiline=True, dynamic_prompts=True), + ], + outputs=[io.Conditioning.Output()], + is_experimental=True, + ) - CATEGORY = "_for_testing/conditioning" - - def encode(self, clip, conditioning, text): + @classmethod + def execute(cls, clip, conditioning, text) -> io.NodeOutput: tokens = clip.tokenize(text) cond, pooled = clip.encode_from_tokens(tokens, return_pooled=True) c = [] @@ -18,32 +28,41 @@ class CLIPTextEncodeControlnet: n[1]['cross_attn_controlnet'] = cond n[1]['pooled_output_controlnet'] = pooled c.append(n) - return (c, ) + return io.NodeOutput(c) -class T5TokenizerOptions: +class T5TokenizerOptions(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "clip": ("CLIP", ), - "min_padding": ("INT", {"default": 0, "min": 0, "max": 10000, "step": 1}), - "min_length": ("INT", {"default": 0, "min": 0, "max": 10000, "step": 1}), - } - } + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="T5TokenizerOptions", + category="_for_testing/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.Int.Input("min_padding", default=0, min=0, max=10000, step=1), + io.Int.Input("min_length", default=0, min=0, max=10000, step=1), + ], + outputs=[io.Clip.Output()], + is_experimental=True, + ) - CATEGORY = "_for_testing/conditioning" - RETURN_TYPES = ("CLIP",) - FUNCTION = "set_options" - - def set_options(self, clip, min_padding, min_length): + @classmethod + def execute(cls, clip, min_padding, min_length) -> io.NodeOutput: clip = clip.clone() for t5_type in ["t5xxl", "pile_t5xl", "t5base", "mt5xl", "umt5xxl"]: clip.set_tokenizer_option("{}_min_padding".format(t5_type), min_padding) clip.set_tokenizer_option("{}_min_length".format(t5_type), min_length) - return (clip, ) + return io.NodeOutput(clip) -NODE_CLASS_MAPPINGS = { - "CLIPTextEncodeControlnet": CLIPTextEncodeControlnet, - "T5TokenizerOptions": T5TokenizerOptions, -} + +class CondExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + CLIPTextEncodeControlnet, + T5TokenizerOptions, + ] + + +async def comfy_entrypoint() -> CondExtension: + return CondExtension() From 53c9c7d39ad8a459e84a29e46a3e053154ef6013 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 13 Sep 2025 00:39:55 +0300 Subject: [PATCH 277/759] convert CFG nodes to V3 schema (#9717) --- comfy_extras/nodes_cfg.py | 71 +++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 26 deletions(-) diff --git a/comfy_extras/nodes_cfg.py b/comfy_extras/nodes_cfg.py index 5abdc115a..4ebb4b51e 100644 --- a/comfy_extras/nodes_cfg.py +++ b/comfy_extras/nodes_cfg.py @@ -1,5 +1,10 @@ +from typing_extensions import override + import torch +from comfy_api.latest import ComfyExtension, io + + # https://github.com/WeichenFan/CFG-Zero-star def optimized_scale(positive, negative): positive_flat = positive.reshape(positive.shape[0], -1) @@ -16,17 +21,20 @@ def optimized_scale(positive, negative): return st_star.reshape([positive.shape[0]] + [1] * (positive.ndim - 1)) -class CFGZeroStar: +class CFGZeroStar(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"model": ("MODEL",), - }} - RETURN_TYPES = ("MODEL",) - RETURN_NAMES = ("patched_model",) - FUNCTION = "patch" - CATEGORY = "advanced/guidance" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="CFGZeroStar", + category="advanced/guidance", + inputs=[ + io.Model.Input("model"), + ], + outputs=[io.Model.Output(display_name="patched_model")], + ) - def patch(self, model): + @classmethod + def execute(cls, model) -> io.NodeOutput: m = model.clone() def cfg_zero_star(args): guidance_scale = args['cond_scale'] @@ -38,21 +46,24 @@ class CFGZeroStar: return out + uncond_p * (alpha - 1.0) + guidance_scale * uncond_p * (1.0 - alpha) m.set_model_sampler_post_cfg_function(cfg_zero_star) - return (m, ) + return io.NodeOutput(m) -class CFGNorm: +class CFGNorm(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"model": ("MODEL",), - "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - RETURN_NAMES = ("patched_model",) - FUNCTION = "patch" - CATEGORY = "advanced/guidance" - EXPERIMENTAL = True + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="CFGNorm", + category="advanced/guidance", + inputs=[ + io.Model.Input("model"), + io.Float.Input("strength", default=1.0, min=0.0, max=100.0, step=0.01), + ], + outputs=[io.Model.Output(display_name="patched_model")], + is_experimental=True, + ) - def patch(self, model, strength): + @classmethod + def execute(cls, model, strength) -> io.NodeOutput: m = model.clone() def cfg_norm(args): cond_p = args['cond_denoised'] @@ -64,9 +75,17 @@ class CFGNorm: return pred_text_ * scale * strength m.set_model_sampler_post_cfg_function(cfg_norm) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "CFGZeroStar": CFGZeroStar, - "CFGNorm": CFGNorm, -} + +class CfgExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + CFGZeroStar, + CFGNorm, + ] + + +async def comfy_entrypoint() -> CfgExtension: + return CfgExtension() From af99928f2218fc240dcfb3688ec47317ca146a78 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 13 Sep 2025 00:40:34 +0300 Subject: [PATCH 278/759] convert Canny node to V3 schema (#9743) --- comfy_extras/nodes_canny.py | 46 +++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/comfy_extras/nodes_canny.py b/comfy_extras/nodes_canny.py index d85e6b856..576f3640a 100644 --- a/comfy_extras/nodes_canny.py +++ b/comfy_extras/nodes_canny.py @@ -1,25 +1,41 @@ from kornia.filters import canny +from typing_extensions import override + import comfy.model_management +from comfy_api.latest import ComfyExtension, io -class Canny: +class Canny(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"image": ("IMAGE",), - "low_threshold": ("FLOAT", {"default": 0.4, "min": 0.01, "max": 0.99, "step": 0.01}), - "high_threshold": ("FLOAT", {"default": 0.8, "min": 0.01, "max": 0.99, "step": 0.01}) - }} + def define_schema(cls): + return io.Schema( + node_id="Canny", + category="image/preprocessors", + inputs=[ + io.Image.Input("image"), + io.Float.Input("low_threshold", default=0.4, min=0.01, max=0.99, step=0.01), + io.Float.Input("high_threshold", default=0.8, min=0.01, max=0.99, step=0.01), + ], + outputs=[io.Image.Output()], + ) - RETURN_TYPES = ("IMAGE",) - FUNCTION = "detect_edge" + @classmethod + def detect_edge(cls, image, low_threshold, high_threshold): + # Deprecated: use the V3 schema's `execute` method instead of this. + return cls.execute(image, low_threshold, high_threshold) - CATEGORY = "image/preprocessors" - - def detect_edge(self, image, low_threshold, high_threshold): + @classmethod + def execute(cls, image, low_threshold, high_threshold) -> io.NodeOutput: output = canny(image.to(comfy.model_management.get_torch_device()).movedim(-1, 1), low_threshold, high_threshold) img_out = output[1].to(comfy.model_management.intermediate_device()).repeat(1, 3, 1, 1).movedim(1, -1) - return (img_out,) + return io.NodeOutput(img_out) -NODE_CLASS_MAPPINGS = { - "Canny": Canny, -} + +class CannyExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [Canny] + + +async def comfy_entrypoint() -> CannyExtension: + return CannyExtension() From 581bae2af30b0839a39734bd97006c4009f9d70a Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 13 Sep 2025 00:41:26 +0300 Subject: [PATCH 279/759] convert Moonvalley API nodes to the V3 schema (#9698) --- comfy_api_nodes/nodes_moonvalley.py | 572 +++++++++++++++------------- 1 file changed, 298 insertions(+), 274 deletions(-) diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 806a70e06..08e838fef 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -1,6 +1,7 @@ import logging from typing import Any, Callable, Optional, TypeVar import torch +from typing_extensions import override from comfy_api_nodes.util.validation_utils import ( get_image_dimensions, validate_image_dimensions, @@ -26,11 +27,9 @@ from comfy_api_nodes.apinode_utils import ( upload_images_to_comfyapi, upload_video_to_comfyapi, ) -from comfy_api_nodes.mapper_utils import model_field_to_node_input -from comfy_api.input.video_types import VideoInput -from comfy.comfy_types.node_typing import IO -from comfy_api.input_impl import VideoFromFile +from comfy_api.input import VideoInput +from comfy_api.latest import ComfyExtension, InputImpl, io as comfy_io import av import io @@ -362,7 +361,7 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: # Return as VideoFromFile using the buffer output_buffer.seek(0) - return VideoFromFile(output_buffer) + return InputImpl.VideoFromFile(output_buffer) except Exception as e: # Clean up on error @@ -373,166 +372,150 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: raise RuntimeError(f"Failed to trim video: {str(e)}") from e -# --- BaseMoonvalleyVideoNode --- -class BaseMoonvalleyVideoNode: - def parseWidthHeightFromRes(self, resolution: str): - # Accepts a string like "16:9 (1920 x 1080)" and returns width, height as a dict - res_map = { - "16:9 (1920 x 1080)": {"width": 1920, "height": 1080}, - "9:16 (1080 x 1920)": {"width": 1080, "height": 1920}, - "1:1 (1152 x 1152)": {"width": 1152, "height": 1152}, - "4:3 (1536 x 1152)": {"width": 1536, "height": 1152}, - "3:4 (1152 x 1536)": {"width": 1152, "height": 1536}, - "21:9 (2560 x 1080)": {"width": 2560, "height": 1080}, - } - if resolution in res_map: - return res_map[resolution] - else: - # Default to 1920x1080 if unknown - return {"width": 1920, "height": 1080} +def parse_width_height_from_res(resolution: str): + # Accepts a string like "16:9 (1920 x 1080)" and returns width, height as a dict + res_map = { + "16:9 (1920 x 1080)": {"width": 1920, "height": 1080}, + "9:16 (1080 x 1920)": {"width": 1080, "height": 1920}, + "1:1 (1152 x 1152)": {"width": 1152, "height": 1152}, + "4:3 (1536 x 1152)": {"width": 1536, "height": 1152}, + "3:4 (1152 x 1536)": {"width": 1152, "height": 1536}, + "21:9 (2560 x 1080)": {"width": 2560, "height": 1080}, + } + return res_map.get(resolution, {"width": 1920, "height": 1080}) - def parseControlParameter(self, value): - control_map = { - "Motion Transfer": "motion_control", - "Canny": "canny_control", - "Pose Transfer": "pose_control", - "Depth": "depth_control", - } - if value in control_map: - return control_map[value] - else: - return control_map["Motion Transfer"] - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> MoonvalleyPromptResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{API_PROMPTS_ENDPOINT}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MoonvalleyPromptResponse, - ), - result_url_extractor=get_video_url_from_response, - node_id=node_id, - ) +def parse_control_parameter(value): + control_map = { + "Motion Transfer": "motion_control", + "Canny": "canny_control", + "Pose Transfer": "pose_control", + "Depth": "depth_control", + } + return control_map.get(value, control_map["Motion Transfer"]) + + +async def get_response( + task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None +) -> MoonvalleyPromptResponse: + return await poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=f"{API_PROMPTS_ENDPOINT}/{task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=MoonvalleyPromptResponse, + ), + result_url_extractor=get_video_url_from_response, + node_id=node_id, + ) + + +class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, - MoonvalleyTextToVideoRequest, - "prompt_text", + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="MoonvalleyImg2VideoNode", + display_name="Moonvalley Marey Image to Video", + category="api node/video/Moonvalley Marey", + description="Moonvalley Marey Image to Video Node", + inputs=[ + comfy_io.Image.Input( + "image", + tooltip="The reference image used to generate the video", + ), + comfy_io.String.Input( + "prompt", multiline=True, ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - MoonvalleyTextToVideoInferenceParams, + comfy_io.String.Input( "negative_prompt", multiline=True, - default=" gopro, bright, contrast, static, overexposed, vignette, artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, wobbly, weird, low quality, plastic, stock footage, video camera, boring", + default=" gopro, bright, contrast, static, overexposed, vignette, " + "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " + "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " + "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " + "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " + "wobbly, weird, low quality, plastic, stock footage, video camera, boring", + tooltip="Negative prompt text", ), - "resolution": ( - IO.COMBO, - { - "options": [ - "16:9 (1920 x 1080)", - "9:16 (1080 x 1920)", - "1:1 (1152 x 1152)", - "4:3 (1440 x 1080)", - "3:4 (1080 x 1440)", - "21:9 (2560 x 1080)", - ], - "default": "16:9 (1920 x 1080)", - "tooltip": "Resolution of the output video", - }, + comfy_io.Combo.Input( + "resolution", + options=[ + "16:9 (1920 x 1080)", + "9:16 (1080 x 1920)", + "1:1 (1152 x 1152)", + "4:3 (1536 x 1152)", + "3:4 (1152 x 1536)", + "21:9 (2560 x 1080)", + ], + default="16:9 (1920 x 1080)", + tooltip="Resolution of the output video", ), - "prompt_adherence": model_field_to_node_input( - IO.FLOAT, - MoonvalleyTextToVideoInferenceParams, - "guidance_scale", + comfy_io.Float.Input( + "prompt_adherence", default=10.0, - step=1, - min=1, - max=20, + min=1.0, + max=20.0, + step=1.0, + tooltip="Guidance scale for generation control", ), - "seed": model_field_to_node_input( - IO.INT, - MoonvalleyTextToVideoInferenceParams, + comfy_io.Int.Input( "seed", default=9, min=0, max=4294967295, step=1, - display="number", + display_mode=comfy_io.NumberDisplay.number, tooltip="Random seed value", ), - "steps": model_field_to_node_input( - IO.INT, - MoonvalleyTextToVideoInferenceParams, + comfy_io.Int.Input( "steps", default=100, min=1, max=100, + step=1, + tooltip="Number of denoising steps", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - "optional": { - "image": model_field_to_node_input( - IO.IMAGE, - MoonvalleyTextToVideoRequest, - "image_url", - tooltip="The reference image used to generate the video", - ), - }, - } - - RETURN_TYPES = ("STRING",) - FUNCTION = "generate" - CATEGORY = "api node/video/Moonvalley Marey" - API_NODE = True - - def generate(self, **kwargs): - return None - - -# --- MoonvalleyImg2VideoNode --- -class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(cls): - return super().INPUT_TYPES() - - RETURN_TYPES = ("VIDEO",) - RETURN_NAMES = ("video",) - DESCRIPTION = "Moonvalley Marey Image to Video Node" - - async def generate( - self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs - ): - image = kwargs.get("image", None) - if image is None: - raise MoonvalleyApiError("image is required") - + async def execute( + cls, + image: torch.Tensor, + prompt: str, + negative_prompt: str, + resolution: str, + prompt_adherence: float, + seed: int, + steps: int, + ) -> comfy_io.NodeOutput: validate_input_image(image, True) validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) - width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) + width_height = parse_width_height_from_res(resolution) + + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } inference_params = MoonvalleyTextToVideoInferenceParams( negative_prompt=negative_prompt, - steps=kwargs.get("steps"), - seed=kwargs.get("seed"), - guidance_scale=kwargs.get("prompt_adherence"), + steps=steps, + seed=seed, + guidance_scale=prompt_adherence, num_frames=128, - width=width_height.get("width"), - height=width_height.get("height"), + width=width_height["width"], + height=width_height["height"], use_negative_prompts=True, ) """Upload image to comfy backend to have a URL available for further processing""" @@ -541,7 +524,7 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): image_url = ( await upload_images_to_comfyapi( - image, max_images=1, auth_kwargs=kwargs, mime_type=mime_type + image, max_images=1, auth_kwargs=auth, mime_type=mime_type ) )[0] @@ -556,127 +539,102 @@ class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode): response_model=MoonvalleyPromptResponse, ), request=request, - auth_kwargs=kwargs, + auth_kwargs=auth, ) task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id + final_response = await get_response( + task_id, auth_kwargs=auth, node_id=cls.hidden.unique_id ) video = await download_url_to_video_output(final_response.output_url) - return (video,) + return comfy_io.NodeOutput(video) -# --- MoonvalleyVid2VidNode --- -class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): - def __init__(self): - super().__init__() +class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, - MoonvalleyVideoToVideoRequest, - "prompt_text", + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="MoonvalleyVideo2VideoNode", + display_name="Moonvalley Marey Video to Video", + category="api node/video/Moonvalley Marey", + description="", + inputs=[ + comfy_io.String.Input( + "prompt", multiline=True, + tooltip="Describes the video to generate", ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - MoonvalleyVideoToVideoInferenceParams, + comfy_io.String.Input( "negative_prompt", multiline=True, - default=" gopro, bright, contrast, static, overexposed, vignette, artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, wobbly, weird, low quality, plastic, stock footage, video camera, boring", + default=" gopro, bright, contrast, static, overexposed, vignette, " + "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " + "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " + "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " + "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " + "wobbly, weird, low quality, plastic, stock footage, video camera, boring", + tooltip="Negative prompt text", ), - "seed": model_field_to_node_input( - IO.INT, - MoonvalleyVideoToVideoInferenceParams, + comfy_io.Int.Input( "seed", default=9, min=0, max=4294967295, step=1, - display="number", + display_mode=comfy_io.NumberDisplay.number, tooltip="Random seed value", control_after_generate=False, ), - "prompt_adherence": model_field_to_node_input( - IO.FLOAT, - MoonvalleyVideoToVideoInferenceParams, - "guidance_scale", - default=10.0, + comfy_io.Video.Input( + "video", + tooltip="The reference video used to generate the output video. Must be at least 5 seconds long. " + "Videos longer than 5s will be automatically trimmed. Only MP4 format supported.", + ), + comfy_io.Combo.Input( + "control_type", + options=["Motion Transfer", "Pose Transfer"], + default="Motion Transfer", + optional=True, + ), + comfy_io.Int.Input( + "motion_intensity", + default=100, + min=0, + max=100, step=1, - min=1, - max=20, + tooltip="Only used if control_type is 'Motion Transfer'", + optional=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - "optional": { - "video": ( - IO.VIDEO, - { - "default": "", - "multiline": False, - "tooltip": "The reference video used to generate the output video. Must be at least 5 seconds long. Videos longer than 5s will be automatically trimmed. Only MP4 format supported.", - }, - ), - "control_type": ( - ["Motion Transfer", "Pose Transfer"], - {"default": "Motion Transfer"}, - ), - "motion_intensity": ( - "INT", - { - "default": 100, - "step": 1, - "min": 0, - "max": 100, - "tooltip": "Only used if control_type is 'Motion Transfer'", - }, - ), - "image": model_field_to_node_input( - IO.IMAGE, - MoonvalleyTextToVideoRequest, - "image_url", - tooltip="The reference image used to generate the video", - ), - }, + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + prompt: str, + negative_prompt: str, + seed: int, + video: Optional[VideoInput] = None, + control_type: str = "Motion Transfer", + motion_intensity: Optional[int] = 100, + ) -> comfy_io.NodeOutput: + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, } - RETURN_TYPES = ("VIDEO",) - RETURN_NAMES = ("video",) - - async def generate( - self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs - ): - video = kwargs.get("video") - image = kwargs.get("image", None) - - if not video: - raise MoonvalleyApiError("video is required") - - video_url = "" - if video: - validated_video = validate_video_to_video_input(video) - video_url = await upload_video_to_comfyapi( - validated_video, auth_kwargs=kwargs - ) - mime_type = "image/png" - - if not image is None: - validate_input_image(image, with_frame_conditioning=True) - image_url = await upload_images_to_comfyapi( - image=image, auth_kwargs=kwargs, max_images=1, mime_type=mime_type - ) - control_type = kwargs.get("control_type") - motion_intensity = kwargs.get("motion_intensity") + validated_video = validate_video_to_video_input(video) + video_url = await upload_video_to_comfyapi(validated_video, auth_kwargs=auth) """Validate prompts and inference input""" validate_prompts(prompt, negative_prompt) @@ -688,11 +646,11 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): inference_params = MoonvalleyVideoToVideoInferenceParams( negative_prompt=negative_prompt, - seed=kwargs.get("seed"), + seed=seed, control_params=control_params, ) - control = self.parseControlParameter(control_type) + control = parse_control_parameter(control_type) request = MoonvalleyVideoToVideoRequest( control_type=control, @@ -700,7 +658,6 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): prompt_text=prompt, inference_params=inference_params, ) - request.image_url = image_url if not image is None else None initial_operation = SynchronousOperation( endpoint=ApiEndpoint( @@ -710,58 +667,125 @@ class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode): response_model=MoonvalleyPromptResponse, ), request=request, - auth_kwargs=kwargs, + auth_kwargs=auth, ) task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id + final_response = await get_response( + task_id, auth_kwargs=auth, node_id=cls.hidden.unique_id ) video = await download_url_to_video_output(final_response.output_url) - - return (video,) + return comfy_io.NodeOutput(video) -# --- MoonvalleyTxt2VideoNode --- -class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): - def __init__(self): - super().__init__() - - RETURN_TYPES = ("VIDEO",) - RETURN_NAMES = ("video",) +class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(cls): - input_types = super().INPUT_TYPES() - # Remove image-specific parameters - for param in ["image"]: - if param in input_types["optional"]: - del input_types["optional"][param] - return input_types + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="MoonvalleyTxt2VideoNode", + display_name="Moonvalley Marey Text to Video", + category="api node/video/Moonvalley Marey", + description="", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + ), + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default=" gopro, bright, contrast, static, overexposed, vignette, " + "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " + "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " + "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " + "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " + "wobbly, weird, low quality, plastic, stock footage, video camera, boring", + tooltip="Negative prompt text", + ), + comfy_io.Combo.Input( + "resolution", + options=[ + "16:9 (1920 x 1080)", + "9:16 (1080 x 1920)", + "1:1 (1152 x 1152)", + "4:3 (1536 x 1152)", + "3:4 (1152 x 1536)", + "21:9 (2560 x 1080)", + ], + default="16:9 (1920 x 1080)", + tooltip="Resolution of the output video", + ), + comfy_io.Float.Input( + "prompt_adherence", + default=10.0, + min=1.0, + max=20.0, + step=1.0, + tooltip="Guidance scale for generation control", + ), + comfy_io.Int.Input( + "seed", + default=9, + min=0, + max=4294967295, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Random seed value", + ), + comfy_io.Int.Input( + "steps", + default=100, + min=1, + max=100, + step=1, + tooltip="Inference steps", + ), + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - async def generate( - self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs - ): + @classmethod + async def execute( + cls, + prompt: str, + negative_prompt: str, + resolution: str, + prompt_adherence: float, + seed: int, + steps: int, + ) -> comfy_io.NodeOutput: validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) - width_height = self.parseWidthHeightFromRes(kwargs.get("resolution")) + width_height = parse_width_height_from_res(resolution) + + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } inference_params = MoonvalleyTextToVideoInferenceParams( negative_prompt=negative_prompt, - steps=kwargs.get("steps"), - seed=kwargs.get("seed"), - guidance_scale=kwargs.get("prompt_adherence"), + steps=steps, + seed=seed, + guidance_scale=prompt_adherence, num_frames=128, - width=width_height.get("width"), - height=width_height.get("height"), + width=width_height["width"], + height=width_height["height"], ) request = MoonvalleyTextToVideoRequest( prompt_text=prompt, inference_params=inference_params ) - initial_operation = SynchronousOperation( + init_op = SynchronousOperation( endpoint=ApiEndpoint( path=API_TXT2VIDEO_ENDPOINT, method=HttpMethod.POST, @@ -769,29 +793,29 @@ class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode): response_model=MoonvalleyPromptResponse, ), request=request, - auth_kwargs=kwargs, + auth_kwargs=auth, ) - task_creation_response = await initial_operation.execute() + task_creation_response = await init_op.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id + final_response = await get_response( + task_id, auth_kwargs=auth, node_id=cls.hidden.unique_id ) video = await download_url_to_video_output(final_response.output_url) - return (video,) + return comfy_io.NodeOutput(video) -NODE_CLASS_MAPPINGS = { - "MoonvalleyImg2VideoNode": MoonvalleyImg2VideoNode, - "MoonvalleyTxt2VideoNode": MoonvalleyTxt2VideoNode, - "MoonvalleyVideo2VideoNode": MoonvalleyVideo2VideoNode, -} +class MoonvalleyExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + MoonvalleyImg2VideoNode, + MoonvalleyTxt2VideoNode, + MoonvalleyVideo2VideoNode, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "MoonvalleyImg2VideoNode": "Moonvalley Marey Image to Video", - "MoonvalleyTxt2VideoNode": "Moonvalley Marey Text to Video", - "MoonvalleyVideo2VideoNode": "Moonvalley Marey Video to Video", -} +async def comfy_entrypoint() -> MoonvalleyExtension: + return MoonvalleyExtension() From b149e2e1e302e75ce5b47e9b823b42b304d70b4b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 12 Sep 2025 14:53:15 -0700 Subject: [PATCH 280/759] Better way of doing the generator for the hunyuan image noise aug. (#9834) --- comfy/model_base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 4176bca25..324d89cff 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1448,7 +1448,9 @@ class HunyuanImage21Refiner(HunyuanImage21): image = self.process_latent_in(image) image = utils.resize_to_batch_size(image, noise.shape[0]) if noise_augmentation > 0: - noise = torch.randn(image.shape, generator=torch.manual_seed(kwargs.get("seed", 0) - 10), dtype=image.dtype, device="cpu").to(image.device) + generator = torch.Generator(device="cpu") + generator.manual_seed(kwargs.get("seed", 0) - 10) + noise = torch.randn(image.shape, generator=generator, dtype=image.dtype, device="cpu").to(image.device) image = noise_augmentation * noise + min(1.0 - noise_augmentation, 0.75) * image else: image = 0.75 * image From d7f40442f91a02946cab7445c6204bf154b1e86f Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Fri, 12 Sep 2025 15:07:38 -0700 Subject: [PATCH 281/759] Enable Runtime Selection of Attention Functions (#9639) * Looking into a @wrap_attn decorator to look for 'optimized_attention_override' entry in transformer_options * Created logging code for this branch so that it can be used to track down all the code paths where transformer_options would need to be added * Fix memory usage issue with inspect * Made WAN attention receive transformer_options, test node added to wan to test out attention override later * Added **kwargs to all attention functions so transformer_options could potentially be passed through * Make sure wrap_attn doesn't make itself recurse infinitely, attempt to load SageAttention and FlashAttention if not enabled so that they can be marked as available or not, create registry for available attention * Turn off attention logging for now, make AttentionOverrideTestNode have a dropdown with available attention (this is a test node only) * Make flux work with optimized_attention_override * Add logs to verify optimized_attention_override is passed all the way into attention function * Make Qwen work with optimized_attention_override * Made hidream work with optimized_attention_override * Made wan patches_replace work with optimized_attention_override * Made SD3 work with optimized_attention_override * Made HunyuanVideo work with optimized_attention_override * Made Mochi work with optimized_attention_override * Made LTX work with optimized_attention_override * Made StableAudio work with optimized_attention_override * Made optimized_attention_override work with ACE Step * Made Hunyuan3D work with optimized_attention_override * Make CosmosPredict2 work with optimized_attention_override * Made CosmosVideo work with optimized_attention_override * Made Omnigen 2 work with optimized_attention_override * Made StableCascade work with optimized_attention_override * Made AuraFlow work with optimized_attention_override * Made Lumina work with optimized_attention_override * Made Chroma work with optimized_attention_override * Made SVD work with optimized_attention_override * Fix WanI2VCrossAttention so that it expects to receive transformer_options * Fixed Wan2.1 Fun Camera transformer_options passthrough * Fixed WAN 2.1 VACE transformer_options passthrough * Add optimized to get_attention_function * Disable attention logs for now * Remove attention logging code * Remove _register_core_attention_functions, as we wouldn't want someone to call that, just in case * Satisfy ruff * Remove AttentionOverrideTest node, that's something to cook up for later --- comfy/ldm/ace/attention.py | 9 +- comfy/ldm/ace/model.py | 4 + comfy/ldm/audio/dit.py | 25 ++-- comfy/ldm/aura/mmdit.py | 29 ++--- comfy/ldm/cascade/common.py | 12 +- comfy/ldm/cascade/stage_b.py | 14 +-- comfy/ldm/cascade/stage_c.py | 14 +-- comfy/ldm/chroma/layers.py | 8 +- comfy/ldm/chroma/model.py | 17 ++- comfy/ldm/cosmos/blocks.py | 10 +- comfy/ldm/cosmos/model.py | 2 + comfy/ldm/cosmos/predict2.py | 17 ++- comfy/ldm/flux/layers.py | 10 +- comfy/ldm/flux/math.py | 4 +- comfy/ldm/flux/model.py | 17 ++- .../genmo/joint_model/asymm_models_joint.py | 11 +- comfy/ldm/hidream/model.py | 18 ++- comfy/ldm/hunyuan3d/model.py | 17 ++- comfy/ldm/hunyuan_video/model.py | 25 ++-- comfy/ldm/lightricks/model.py | 19 +-- comfy/ldm/lumina/model.py | 17 ++- comfy/ldm/modules/attention.py | 114 +++++++++++++----- comfy/ldm/modules/diffusionmodules/mmdit.py | 9 +- comfy/ldm/omnigen/omnigen2.py | 23 ++-- comfy/ldm/qwen_image/model.py | 12 +- comfy/ldm/wan/model.py | 38 +++--- 26 files changed, 316 insertions(+), 179 deletions(-) diff --git a/comfy/ldm/ace/attention.py b/comfy/ldm/ace/attention.py index f20a01669..670eb9783 100644 --- a/comfy/ldm/ace/attention.py +++ b/comfy/ldm/ace/attention.py @@ -133,6 +133,7 @@ class Attention(nn.Module): hidden_states: torch.Tensor, encoder_hidden_states: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, + transformer_options={}, **cross_attention_kwargs, ) -> torch.Tensor: return self.processor( @@ -140,6 +141,7 @@ class Attention(nn.Module): hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, + transformer_options=transformer_options, **cross_attention_kwargs, ) @@ -366,6 +368,7 @@ class CustomerAttnProcessor2_0: encoder_attention_mask: Optional[torch.FloatTensor] = None, rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None, rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None, + transformer_options={}, *args, **kwargs, ) -> torch.Tensor: @@ -433,7 +436,7 @@ class CustomerAttnProcessor2_0: # the output of sdp = (batch, num_heads, seq_len, head_dim) hidden_states = optimized_attention( - query, key, value, heads=query.shape[1], mask=attention_mask, skip_reshape=True, + query, key, value, heads=query.shape[1], mask=attention_mask, skip_reshape=True, transformer_options=transformer_options, ).to(query.dtype) # linear proj @@ -697,6 +700,7 @@ class LinearTransformerBlock(nn.Module): rotary_freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]] = None, rotary_freqs_cis_cross: Union[torch.Tensor, Tuple[torch.Tensor]] = None, temb: torch.FloatTensor = None, + transformer_options={}, ): N = hidden_states.shape[0] @@ -720,6 +724,7 @@ class LinearTransformerBlock(nn.Module): encoder_attention_mask=encoder_attention_mask, rotary_freqs_cis=rotary_freqs_cis, rotary_freqs_cis_cross=rotary_freqs_cis_cross, + transformer_options=transformer_options, ) else: attn_output, _ = self.attn( @@ -729,6 +734,7 @@ class LinearTransformerBlock(nn.Module): encoder_attention_mask=None, rotary_freqs_cis=rotary_freqs_cis, rotary_freqs_cis_cross=None, + transformer_options=transformer_options, ) if self.use_adaln_single: @@ -743,6 +749,7 @@ class LinearTransformerBlock(nn.Module): encoder_attention_mask=encoder_attention_mask, rotary_freqs_cis=rotary_freqs_cis, rotary_freqs_cis_cross=rotary_freqs_cis_cross, + transformer_options=transformer_options, ) hidden_states = attn_output + hidden_states diff --git a/comfy/ldm/ace/model.py b/comfy/ldm/ace/model.py index 41d85eeb5..399329853 100644 --- a/comfy/ldm/ace/model.py +++ b/comfy/ldm/ace/model.py @@ -314,6 +314,7 @@ class ACEStepTransformer2DModel(nn.Module): output_length: int = 0, block_controlnet_hidden_states: Optional[Union[List[torch.Tensor], torch.Tensor]] = None, controlnet_scale: Union[float, torch.Tensor] = 1.0, + transformer_options={}, ): embedded_timestep = self.timestep_embedder(self.time_proj(timestep).to(dtype=hidden_states.dtype)) temb = self.t_block(embedded_timestep) @@ -339,6 +340,7 @@ class ACEStepTransformer2DModel(nn.Module): rotary_freqs_cis=rotary_freqs_cis, rotary_freqs_cis_cross=encoder_rotary_freqs_cis, temb=temb, + transformer_options=transformer_options, ) output = self.final_layer(hidden_states, embedded_timestep, output_length) @@ -393,6 +395,7 @@ class ACEStepTransformer2DModel(nn.Module): output_length = hidden_states.shape[-1] + transformer_options = kwargs.get("transformer_options", {}) output = self.decode( hidden_states=hidden_states, attention_mask=attention_mask, @@ -402,6 +405,7 @@ class ACEStepTransformer2DModel(nn.Module): output_length=output_length, block_controlnet_hidden_states=block_controlnet_hidden_states, controlnet_scale=controlnet_scale, + transformer_options=transformer_options, ) return output diff --git a/comfy/ldm/audio/dit.py b/comfy/ldm/audio/dit.py index d0d69bbdc..ca865189e 100644 --- a/comfy/ldm/audio/dit.py +++ b/comfy/ldm/audio/dit.py @@ -298,7 +298,8 @@ class Attention(nn.Module): mask = None, context_mask = None, rotary_pos_emb = None, - causal = None + causal = None, + transformer_options={}, ): h, kv_h, has_context = self.num_heads, self.kv_heads, context is not None @@ -363,7 +364,7 @@ class Attention(nn.Module): heads_per_kv_head = h // kv_h k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim = 1), (k, v)) - out = optimized_attention(q, k, v, h, skip_reshape=True) + out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options) out = self.to_out(out) if mask is not None: @@ -488,7 +489,8 @@ class TransformerBlock(nn.Module): global_cond=None, mask = None, context_mask = None, - rotary_pos_emb = None + rotary_pos_emb = None, + transformer_options={} ): if self.global_cond_dim is not None and self.global_cond_dim > 0 and global_cond is not None: @@ -498,12 +500,12 @@ class TransformerBlock(nn.Module): residual = x x = self.pre_norm(x) x = x * (1 + scale_self) + shift_self - x = self.self_attn(x, mask = mask, rotary_pos_emb = rotary_pos_emb) + x = self.self_attn(x, mask = mask, rotary_pos_emb = rotary_pos_emb, transformer_options=transformer_options) x = x * torch.sigmoid(1 - gate_self) x = x + residual if context is not None: - x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask) + x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask, transformer_options=transformer_options) if self.conformer is not None: x = x + self.conformer(x) @@ -517,10 +519,10 @@ class TransformerBlock(nn.Module): x = x + residual else: - x = x + self.self_attn(self.pre_norm(x), mask = mask, rotary_pos_emb = rotary_pos_emb) + x = x + self.self_attn(self.pre_norm(x), mask = mask, rotary_pos_emb = rotary_pos_emb, transformer_options=transformer_options) if context is not None: - x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask) + x = x + self.cross_attn(self.cross_attend_norm(x), context = context, context_mask = context_mask, transformer_options=transformer_options) if self.conformer is not None: x = x + self.conformer(x) @@ -606,7 +608,8 @@ class ContinuousTransformer(nn.Module): return_info = False, **kwargs ): - patches_replace = kwargs.get("transformer_options", {}).get("patches_replace", {}) + transformer_options = kwargs.get("transformer_options", {}) + patches_replace = transformer_options.get("patches_replace", {}) batch, seq, device = *x.shape[:2], x.device context = kwargs["context"] @@ -645,13 +648,13 @@ class ContinuousTransformer(nn.Module): if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} - out["img"] = layer(args["img"], rotary_pos_emb=args["pe"], global_cond=args["vec"], context=args["txt"]) + out["img"] = layer(args["img"], rotary_pos_emb=args["pe"], global_cond=args["vec"], context=args["txt"], transformer_options=args["transformer_options"]) return out - out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb}, {"original_block": block_wrap}) + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb, "transformer_options": transformer_options}, {"original_block": block_wrap}) x = out["img"] else: - x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context) + x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context, transformer_options=transformer_options) # x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs) if return_info: diff --git a/comfy/ldm/aura/mmdit.py b/comfy/ldm/aura/mmdit.py index d7f32b5e8..66d9613b6 100644 --- a/comfy/ldm/aura/mmdit.py +++ b/comfy/ldm/aura/mmdit.py @@ -85,7 +85,7 @@ class SingleAttention(nn.Module): ) #@torch.compile() - def forward(self, c): + def forward(self, c, transformer_options={}): bsz, seqlen1, _ = c.shape @@ -95,7 +95,7 @@ class SingleAttention(nn.Module): v = v.view(bsz, seqlen1, self.n_heads, self.head_dim) q, k = self.q_norm1(q), self.k_norm1(k) - output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True) + output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True, transformer_options=transformer_options) c = self.w1o(output) return c @@ -144,7 +144,7 @@ class DoubleAttention(nn.Module): #@torch.compile() - def forward(self, c, x): + def forward(self, c, x, transformer_options={}): bsz, seqlen1, _ = c.shape bsz, seqlen2, _ = x.shape @@ -168,7 +168,7 @@ class DoubleAttention(nn.Module): torch.cat([cv, xv], dim=1), ) - output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True) + output = optimized_attention(q.permute(0, 2, 1, 3), k.permute(0, 2, 1, 3), v.permute(0, 2, 1, 3), self.n_heads, skip_reshape=True, transformer_options=transformer_options) c, x = output.split([seqlen1, seqlen2], dim=1) c = self.w1o(c) @@ -207,7 +207,7 @@ class MMDiTBlock(nn.Module): self.is_last = is_last #@torch.compile() - def forward(self, c, x, global_cond, **kwargs): + def forward(self, c, x, global_cond, transformer_options={}, **kwargs): cres, xres = c, x @@ -225,7 +225,7 @@ class MMDiTBlock(nn.Module): x = modulate(self.normX1(x), xshift_msa, xscale_msa) # attention - c, x = self.attn(c, x) + c, x = self.attn(c, x, transformer_options=transformer_options) c = self.normC2(cres + cgate_msa.unsqueeze(1) * c) @@ -255,13 +255,13 @@ class DiTBlock(nn.Module): self.mlp = MLP(dim, hidden_dim=dim * 4, dtype=dtype, device=device, operations=operations) #@torch.compile() - def forward(self, cx, global_cond, **kwargs): + def forward(self, cx, global_cond, transformer_options={}, **kwargs): cxres = cx shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.modCX( global_cond ).chunk(6, dim=1) cx = modulate(self.norm1(cx), shift_msa, scale_msa) - cx = self.attn(cx) + cx = self.attn(cx, transformer_options=transformer_options) cx = self.norm2(cxres + gate_msa.unsqueeze(1) * cx) mlpout = self.mlp(modulate(cx, shift_mlp, scale_mlp)) cx = gate_mlp.unsqueeze(1) * mlpout @@ -473,13 +473,14 @@ class MMDiT(nn.Module): out = {} out["txt"], out["img"] = layer(args["txt"], args["img"], - args["vec"]) + args["vec"], + transformer_options=args["transformer_options"]) return out - out = blocks_replace[("double_block", i)]({"img": x, "txt": c, "vec": global_cond}, {"original_block": block_wrap}) + out = blocks_replace[("double_block", i)]({"img": x, "txt": c, "vec": global_cond, "transformer_options": transformer_options}, {"original_block": block_wrap}) c = out["txt"] x = out["img"] else: - c, x = layer(c, x, global_cond, **kwargs) + c, x = layer(c, x, global_cond, transformer_options=transformer_options, **kwargs) if len(self.single_layers) > 0: c_len = c.size(1) @@ -488,13 +489,13 @@ class MMDiT(nn.Module): if ("single_block", i) in blocks_replace: def block_wrap(args): out = {} - out["img"] = layer(args["img"], args["vec"]) + out["img"] = layer(args["img"], args["vec"], transformer_options=args["transformer_options"]) return out - out = blocks_replace[("single_block", i)]({"img": cx, "vec": global_cond}, {"original_block": block_wrap}) + out = blocks_replace[("single_block", i)]({"img": cx, "vec": global_cond, "transformer_options": transformer_options}, {"original_block": block_wrap}) cx = out["img"] else: - cx = layer(cx, global_cond, **kwargs) + cx = layer(cx, global_cond, transformer_options=transformer_options, **kwargs) x = cx[:, c_len:] diff --git a/comfy/ldm/cascade/common.py b/comfy/ldm/cascade/common.py index 3eaa0c821..42ef98c7a 100644 --- a/comfy/ldm/cascade/common.py +++ b/comfy/ldm/cascade/common.py @@ -32,12 +32,12 @@ class OptimizedAttention(nn.Module): self.out_proj = operations.Linear(c, c, bias=True, dtype=dtype, device=device) - def forward(self, q, k, v): + def forward(self, q, k, v, transformer_options={}): q = self.to_q(q) k = self.to_k(k) v = self.to_v(v) - out = optimized_attention(q, k, v, self.heads) + out = optimized_attention(q, k, v, self.heads, transformer_options=transformer_options) return self.out_proj(out) @@ -47,13 +47,13 @@ class Attention2D(nn.Module): self.attn = OptimizedAttention(c, nhead, dtype=dtype, device=device, operations=operations) # self.attn = nn.MultiheadAttention(c, nhead, dropout=dropout, bias=True, batch_first=True, dtype=dtype, device=device) - def forward(self, x, kv, self_attn=False): + def forward(self, x, kv, self_attn=False, transformer_options={}): orig_shape = x.shape x = x.view(x.size(0), x.size(1), -1).permute(0, 2, 1) # Bx4xHxW -> Bx(HxW)x4 if self_attn: kv = torch.cat([x, kv], dim=1) # x = self.attn(x, kv, kv, need_weights=False)[0] - x = self.attn(x, kv, kv) + x = self.attn(x, kv, kv, transformer_options=transformer_options) x = x.permute(0, 2, 1).view(*orig_shape) return x @@ -114,9 +114,9 @@ class AttnBlock(nn.Module): operations.Linear(c_cond, c, dtype=dtype, device=device) ) - def forward(self, x, kv): + def forward(self, x, kv, transformer_options={}): kv = self.kv_mapper(kv) - x = x + self.attention(self.norm(x), kv, self_attn=self.self_attn) + x = x + self.attention(self.norm(x), kv, self_attn=self.self_attn, transformer_options=transformer_options) return x diff --git a/comfy/ldm/cascade/stage_b.py b/comfy/ldm/cascade/stage_b.py index 773830956..428c67fdf 100644 --- a/comfy/ldm/cascade/stage_b.py +++ b/comfy/ldm/cascade/stage_b.py @@ -173,7 +173,7 @@ class StageB(nn.Module): clip = self.clip_norm(clip) return clip - def _down_encode(self, x, r_embed, clip): + def _down_encode(self, x, r_embed, clip, transformer_options={}): level_outputs = [] block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers) for down_block, downscaler, repmap in block_group: @@ -187,7 +187,7 @@ class StageB(nn.Module): elif isinstance(block, AttnBlock) or ( hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module, AttnBlock)): - x = block(x, clip) + x = block(x, clip, transformer_options=transformer_options) elif isinstance(block, TimestepBlock) or ( hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module, TimestepBlock)): @@ -199,7 +199,7 @@ class StageB(nn.Module): level_outputs.insert(0, x) return level_outputs - def _up_decode(self, level_outputs, r_embed, clip): + def _up_decode(self, level_outputs, r_embed, clip, transformer_options={}): x = level_outputs[0] block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers) for i, (up_block, upscaler, repmap) in enumerate(block_group): @@ -216,7 +216,7 @@ class StageB(nn.Module): elif isinstance(block, AttnBlock) or ( hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module, AttnBlock)): - x = block(x, clip) + x = block(x, clip, transformer_options=transformer_options) elif isinstance(block, TimestepBlock) or ( hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module, TimestepBlock)): @@ -228,7 +228,7 @@ class StageB(nn.Module): x = upscaler(x) return x - def forward(self, x, r, effnet, clip, pixels=None, **kwargs): + def forward(self, x, r, effnet, clip, pixels=None, transformer_options={}, **kwargs): if pixels is None: pixels = x.new_zeros(x.size(0), 3, 8, 8) @@ -245,8 +245,8 @@ class StageB(nn.Module): nn.functional.interpolate(effnet, size=x.shape[-2:], mode='bilinear', align_corners=True)) x = x + nn.functional.interpolate(self.pixels_mapper(pixels), size=x.shape[-2:], mode='bilinear', align_corners=True) - level_outputs = self._down_encode(x, r_embed, clip) - x = self._up_decode(level_outputs, r_embed, clip) + level_outputs = self._down_encode(x, r_embed, clip, transformer_options=transformer_options) + x = self._up_decode(level_outputs, r_embed, clip, transformer_options=transformer_options) return self.clf(x) def update_weights_ema(self, src_model, beta=0.999): diff --git a/comfy/ldm/cascade/stage_c.py b/comfy/ldm/cascade/stage_c.py index b952d0349..ebc4434e2 100644 --- a/comfy/ldm/cascade/stage_c.py +++ b/comfy/ldm/cascade/stage_c.py @@ -182,7 +182,7 @@ class StageC(nn.Module): clip = self.clip_norm(clip) return clip - def _down_encode(self, x, r_embed, clip, cnet=None): + def _down_encode(self, x, r_embed, clip, cnet=None, transformer_options={}): level_outputs = [] block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers) for down_block, downscaler, repmap in block_group: @@ -201,7 +201,7 @@ class StageC(nn.Module): elif isinstance(block, AttnBlock) or ( hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module, AttnBlock)): - x = block(x, clip) + x = block(x, clip, transformer_options=transformer_options) elif isinstance(block, TimestepBlock) or ( hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module, TimestepBlock)): @@ -213,7 +213,7 @@ class StageC(nn.Module): level_outputs.insert(0, x) return level_outputs - def _up_decode(self, level_outputs, r_embed, clip, cnet=None): + def _up_decode(self, level_outputs, r_embed, clip, cnet=None, transformer_options={}): x = level_outputs[0] block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers) for i, (up_block, upscaler, repmap) in enumerate(block_group): @@ -235,7 +235,7 @@ class StageC(nn.Module): elif isinstance(block, AttnBlock) or ( hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module, AttnBlock)): - x = block(x, clip) + x = block(x, clip, transformer_options=transformer_options) elif isinstance(block, TimestepBlock) or ( hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module, TimestepBlock)): @@ -247,7 +247,7 @@ class StageC(nn.Module): x = upscaler(x) return x - def forward(self, x, r, clip_text, clip_text_pooled, clip_img, control=None, **kwargs): + def forward(self, x, r, clip_text, clip_text_pooled, clip_img, control=None, transformer_options={}, **kwargs): # Process the conditioning embeddings r_embed = self.gen_r_embedding(r).to(dtype=x.dtype) for c in self.t_conds: @@ -262,8 +262,8 @@ class StageC(nn.Module): # Model Blocks x = self.embedding(x) - level_outputs = self._down_encode(x, r_embed, clip, cnet) - x = self._up_decode(level_outputs, r_embed, clip, cnet) + level_outputs = self._down_encode(x, r_embed, clip, cnet, transformer_options=transformer_options) + x = self._up_decode(level_outputs, r_embed, clip, cnet, transformer_options=transformer_options) return self.clf(x) def update_weights_ema(self, src_model, beta=0.999): diff --git a/comfy/ldm/chroma/layers.py b/comfy/ldm/chroma/layers.py index 2a0dec606..fc7110cce 100644 --- a/comfy/ldm/chroma/layers.py +++ b/comfy/ldm/chroma/layers.py @@ -76,7 +76,7 @@ class DoubleStreamBlock(nn.Module): ) self.flipped_img_txt = flipped_img_txt - def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None): + def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}): (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec # prepare image for attention @@ -95,7 +95,7 @@ class DoubleStreamBlock(nn.Module): attn = attention(torch.cat((txt_q, img_q), dim=2), torch.cat((txt_k, img_k), dim=2), torch.cat((txt_v, img_v), dim=2), - pe=pe, mask=attn_mask) + pe=pe, mask=attn_mask, transformer_options=transformer_options) txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] @@ -148,7 +148,7 @@ class SingleStreamBlock(nn.Module): self.mlp_act = nn.GELU(approximate="tanh") - def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None) -> Tensor: + def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}) -> Tensor: mod = vec x_mod = torch.addcmul(mod.shift, 1 + mod.scale, self.pre_norm(x)) qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) @@ -157,7 +157,7 @@ class SingleStreamBlock(nn.Module): q, k = self.norm(q, k, v) # compute attention - attn = attention(q, k, v, pe=pe, mask=attn_mask) + attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options) # compute activation in mlp stream, cat again and run second linear layer output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) x.addcmul_(mod.gate, output) diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py index 5cff44dc8..4f709f87d 100644 --- a/comfy/ldm/chroma/model.py +++ b/comfy/ldm/chroma/model.py @@ -193,14 +193,16 @@ class Chroma(nn.Module): txt=args["txt"], vec=args["vec"], pe=args["pe"], - attn_mask=args.get("attn_mask")) + attn_mask=args.get("attn_mask"), + transformer_options=args.get("transformer_options")) return out out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": double_mod, "pe": pe, - "attn_mask": attn_mask}, + "attn_mask": attn_mask, + "transformer_options": transformer_options}, {"original_block": block_wrap}) txt = out["txt"] img = out["img"] @@ -209,7 +211,8 @@ class Chroma(nn.Module): txt=txt, vec=double_mod, pe=pe, - attn_mask=attn_mask) + attn_mask=attn_mask, + transformer_options=transformer_options) if control is not None: # Controlnet control_i = control.get("input") @@ -229,17 +232,19 @@ class Chroma(nn.Module): out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], - attn_mask=args.get("attn_mask")) + attn_mask=args.get("attn_mask"), + transformer_options=args.get("transformer_options")) return out out = blocks_replace[("single_block", i)]({"img": img, "vec": single_mod, "pe": pe, - "attn_mask": attn_mask}, + "attn_mask": attn_mask, + "transformer_options": transformer_options}, {"original_block": block_wrap}) img = out["img"] else: - img = block(img, vec=single_mod, pe=pe, attn_mask=attn_mask) + img = block(img, vec=single_mod, pe=pe, attn_mask=attn_mask, transformer_options=transformer_options) if control is not None: # Controlnet control_o = control.get("output") diff --git a/comfy/ldm/cosmos/blocks.py b/comfy/ldm/cosmos/blocks.py index 5c4356a3f..afb43d469 100644 --- a/comfy/ldm/cosmos/blocks.py +++ b/comfy/ldm/cosmos/blocks.py @@ -176,6 +176,7 @@ class Attention(nn.Module): context=None, mask=None, rope_emb=None, + transformer_options={}, **kwargs, ): """ @@ -184,7 +185,7 @@ class Attention(nn.Module): context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None """ q, k, v = self.cal_qkv(x, context, mask, rope_emb=rope_emb, **kwargs) - out = optimized_attention(q, k, v, self.heads, skip_reshape=True, mask=mask, skip_output_reshape=True) + out = optimized_attention(q, k, v, self.heads, skip_reshape=True, mask=mask, skip_output_reshape=True, transformer_options=transformer_options) del q, k, v out = rearrange(out, " b n s c -> s b (n c)") return self.to_out(out) @@ -546,6 +547,7 @@ class VideoAttn(nn.Module): context: Optional[torch.Tensor] = None, crossattn_mask: Optional[torch.Tensor] = None, rope_emb_L_1_1_D: Optional[torch.Tensor] = None, + transformer_options: Optional[dict] = {}, ) -> torch.Tensor: """ Forward pass for video attention. @@ -571,6 +573,7 @@ class VideoAttn(nn.Module): context_M_B_D, crossattn_mask, rope_emb=rope_emb_L_1_1_D, + transformer_options=transformer_options, ) x_T_H_W_B_D = rearrange(x_THW_B_D, "(t h w) b d -> t h w b d", h=H, w=W) return x_T_H_W_B_D @@ -665,6 +668,7 @@ class DITBuildingBlock(nn.Module): crossattn_mask: Optional[torch.Tensor] = None, rope_emb_L_1_1_D: Optional[torch.Tensor] = None, adaln_lora_B_3D: Optional[torch.Tensor] = None, + transformer_options: Optional[dict] = {}, ) -> torch.Tensor: """ Forward pass for dynamically configured blocks with adaptive normalization. @@ -702,6 +706,7 @@ class DITBuildingBlock(nn.Module): adaln_norm_state(self.norm_state, x, scale_1_1_1_B_D, shift_1_1_1_B_D), context=None, rope_emb_L_1_1_D=rope_emb_L_1_1_D, + transformer_options=transformer_options, ) elif self.block_type in ["cross_attn", "ca"]: x = x + gate_1_1_1_B_D * self.block( @@ -709,6 +714,7 @@ class DITBuildingBlock(nn.Module): context=crossattn_emb, crossattn_mask=crossattn_mask, rope_emb_L_1_1_D=rope_emb_L_1_1_D, + transformer_options=transformer_options, ) else: raise ValueError(f"Unknown block type: {self.block_type}") @@ -784,6 +790,7 @@ class GeneralDITTransformerBlock(nn.Module): crossattn_mask: Optional[torch.Tensor] = None, rope_emb_L_1_1_D: Optional[torch.Tensor] = None, adaln_lora_B_3D: Optional[torch.Tensor] = None, + transformer_options: Optional[dict] = {}, ) -> torch.Tensor: for block in self.blocks: x = block( @@ -793,5 +800,6 @@ class GeneralDITTransformerBlock(nn.Module): crossattn_mask, rope_emb_L_1_1_D=rope_emb_L_1_1_D, adaln_lora_B_3D=adaln_lora_B_3D, + transformer_options=transformer_options, ) return x diff --git a/comfy/ldm/cosmos/model.py b/comfy/ldm/cosmos/model.py index 53698b758..52ef7ef43 100644 --- a/comfy/ldm/cosmos/model.py +++ b/comfy/ldm/cosmos/model.py @@ -520,6 +520,7 @@ class GeneralDIT(nn.Module): x.shape == extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape ), f"{x.shape} != {extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape} {original_shape}" + transformer_options = kwargs.get("transformer_options", {}) for _, block in self.blocks.items(): assert ( self.blocks["block0"].x_format == block.x_format @@ -534,6 +535,7 @@ class GeneralDIT(nn.Module): crossattn_mask, rope_emb_L_1_1_D=rope_emb_L_1_1_D, adaln_lora_B_3D=adaln_lora_B_3D, + transformer_options=transformer_options, ) x_B_T_H_W_D = rearrange(x, "T H W B D -> B T H W D") diff --git a/comfy/ldm/cosmos/predict2.py b/comfy/ldm/cosmos/predict2.py index fcc83ba76..07a4fc79f 100644 --- a/comfy/ldm/cosmos/predict2.py +++ b/comfy/ldm/cosmos/predict2.py @@ -44,7 +44,7 @@ class GPT2FeedForward(nn.Module): return x -def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor, v_B_S_H_D: torch.Tensor) -> torch.Tensor: +def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor, v_B_S_H_D: torch.Tensor, transformer_options: Optional[dict] = {}) -> torch.Tensor: """Computes multi-head attention using PyTorch's native implementation. This function provides a PyTorch backend alternative to Transformer Engine's attention operation. @@ -71,7 +71,7 @@ def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor, v_B_S_H q_B_H_S_D = rearrange(q_B_S_H_D, "b ... h k -> b h ... k").view(in_q_shape[0], in_q_shape[-2], -1, in_q_shape[-1]) k_B_H_S_D = rearrange(k_B_S_H_D, "b ... h v -> b h ... v").view(in_k_shape[0], in_k_shape[-2], -1, in_k_shape[-1]) v_B_H_S_D = rearrange(v_B_S_H_D, "b ... h v -> b h ... v").view(in_k_shape[0], in_k_shape[-2], -1, in_k_shape[-1]) - return optimized_attention(q_B_H_S_D, k_B_H_S_D, v_B_H_S_D, in_q_shape[-2], skip_reshape=True) + return optimized_attention(q_B_H_S_D, k_B_H_S_D, v_B_H_S_D, in_q_shape[-2], skip_reshape=True, transformer_options=transformer_options) class Attention(nn.Module): @@ -180,8 +180,8 @@ class Attention(nn.Module): return q, k, v - def compute_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor: - result = self.attn_op(q, k, v) # [B, S, H, D] + def compute_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, transformer_options: Optional[dict] = {}) -> torch.Tensor: + result = self.attn_op(q, k, v, transformer_options=transformer_options) # [B, S, H, D] return self.output_dropout(self.output_proj(result)) def forward( @@ -189,6 +189,7 @@ class Attention(nn.Module): x: torch.Tensor, context: Optional[torch.Tensor] = None, rope_emb: Optional[torch.Tensor] = None, + transformer_options: Optional[dict] = {}, ) -> torch.Tensor: """ Args: @@ -196,7 +197,7 @@ class Attention(nn.Module): context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None """ q, k, v = self.compute_qkv(x, context, rope_emb=rope_emb) - return self.compute_attention(q, k, v) + return self.compute_attention(q, k, v, transformer_options=transformer_options) class Timesteps(nn.Module): @@ -459,6 +460,7 @@ class Block(nn.Module): rope_emb_L_1_1_D: Optional[torch.Tensor] = None, adaln_lora_B_T_3D: Optional[torch.Tensor] = None, extra_per_block_pos_emb: Optional[torch.Tensor] = None, + transformer_options: Optional[dict] = {}, ) -> torch.Tensor: if extra_per_block_pos_emb is not None: x_B_T_H_W_D = x_B_T_H_W_D + extra_per_block_pos_emb @@ -512,6 +514,7 @@ class Block(nn.Module): rearrange(normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"), None, rope_emb=rope_emb_L_1_1_D, + transformer_options=transformer_options, ), "b (t h w) d -> b t h w d", t=T, @@ -525,6 +528,7 @@ class Block(nn.Module): layer_norm_cross_attn: Callable, _scale_cross_attn_B_T_1_1_D: torch.Tensor, _shift_cross_attn_B_T_1_1_D: torch.Tensor, + transformer_options: Optional[dict] = {}, ) -> torch.Tensor: _normalized_x_B_T_H_W_D = _fn( _x_B_T_H_W_D, layer_norm_cross_attn, _scale_cross_attn_B_T_1_1_D, _shift_cross_attn_B_T_1_1_D @@ -534,6 +538,7 @@ class Block(nn.Module): rearrange(_normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"), crossattn_emb, rope_emb=rope_emb_L_1_1_D, + transformer_options=transformer_options, ), "b (t h w) d -> b t h w d", t=T, @@ -547,6 +552,7 @@ class Block(nn.Module): self.layer_norm_cross_attn, scale_cross_attn_B_T_1_1_D, shift_cross_attn_B_T_1_1_D, + transformer_options=transformer_options, ) x_B_T_H_W_D = result_B_T_H_W_D * gate_cross_attn_B_T_1_1_D + x_B_T_H_W_D @@ -865,6 +871,7 @@ class MiniTrainDIT(nn.Module): "rope_emb_L_1_1_D": rope_emb_L_1_1_D.unsqueeze(1).unsqueeze(0), "adaln_lora_B_T_3D": adaln_lora_B_T_3D, "extra_per_block_pos_emb": extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D, + "transformer_options": kwargs.get("transformer_options", {}), } for block in self.blocks: x_B_T_H_W_D = block( diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py index 113eb2096..ef21b416b 100644 --- a/comfy/ldm/flux/layers.py +++ b/comfy/ldm/flux/layers.py @@ -159,7 +159,7 @@ class DoubleStreamBlock(nn.Module): ) self.flipped_img_txt = flipped_img_txt - def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None): + def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}): img_mod1, img_mod2 = self.img_mod(vec) txt_mod1, txt_mod2 = self.txt_mod(vec) @@ -182,7 +182,7 @@ class DoubleStreamBlock(nn.Module): attn = attention(torch.cat((img_q, txt_q), dim=2), torch.cat((img_k, txt_k), dim=2), torch.cat((img_v, txt_v), dim=2), - pe=pe, mask=attn_mask) + pe=pe, mask=attn_mask, transformer_options=transformer_options) img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:] else: @@ -190,7 +190,7 @@ class DoubleStreamBlock(nn.Module): attn = attention(torch.cat((txt_q, img_q), dim=2), torch.cat((txt_k, img_k), dim=2), torch.cat((txt_v, img_v), dim=2), - pe=pe, mask=attn_mask) + pe=pe, mask=attn_mask, transformer_options=transformer_options) txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:] @@ -244,7 +244,7 @@ class SingleStreamBlock(nn.Module): self.mlp_act = nn.GELU(approximate="tanh") self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations) - def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None) -> Tensor: + def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None, transformer_options={}) -> Tensor: mod, _ = self.modulation(vec) qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) @@ -252,7 +252,7 @@ class SingleStreamBlock(nn.Module): q, k = self.norm(q, k, v) # compute attention - attn = attention(q, k, v, pe=pe, mask=attn_mask) + attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options) # compute activation in mlp stream, cat again and run second linear layer output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) x += apply_mod(output, mod.gate, None, modulation_dims) diff --git a/comfy/ldm/flux/math.py b/comfy/ldm/flux/math.py index 3e0978176..4d743cda2 100644 --- a/comfy/ldm/flux/math.py +++ b/comfy/ldm/flux/math.py @@ -6,7 +6,7 @@ from comfy.ldm.modules.attention import optimized_attention import comfy.model_management -def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor: +def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor: q_shape = q.shape k_shape = k.shape @@ -17,7 +17,7 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor: k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v) heads = q.shape[1] - x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask) + x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask, transformer_options=transformer_options) return x diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 8ea7d4f57..14f90cea5 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -144,14 +144,16 @@ class Flux(nn.Module): txt=args["txt"], vec=args["vec"], pe=args["pe"], - attn_mask=args.get("attn_mask")) + attn_mask=args.get("attn_mask"), + transformer_options=args.get("transformer_options")) return out out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, - "attn_mask": attn_mask}, + "attn_mask": attn_mask, + "transformer_options": transformer_options}, {"original_block": block_wrap}) txt = out["txt"] img = out["img"] @@ -160,7 +162,8 @@ class Flux(nn.Module): txt=txt, vec=vec, pe=pe, - attn_mask=attn_mask) + attn_mask=attn_mask, + transformer_options=transformer_options) if control is not None: # Controlnet control_i = control.get("input") @@ -181,17 +184,19 @@ class Flux(nn.Module): out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], - attn_mask=args.get("attn_mask")) + attn_mask=args.get("attn_mask"), + transformer_options=args.get("transformer_options")) return out out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, - "attn_mask": attn_mask}, + "attn_mask": attn_mask, + "transformer_options": transformer_options}, {"original_block": block_wrap}) img = out["img"] else: - img = block(img, vec=vec, pe=pe, attn_mask=attn_mask) + img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, transformer_options=transformer_options) if control is not None: # Controlnet control_o = control.get("output") diff --git a/comfy/ldm/genmo/joint_model/asymm_models_joint.py b/comfy/ldm/genmo/joint_model/asymm_models_joint.py index 366a8b713..5c1bb4d42 100644 --- a/comfy/ldm/genmo/joint_model/asymm_models_joint.py +++ b/comfy/ldm/genmo/joint_model/asymm_models_joint.py @@ -109,6 +109,7 @@ class AsymmetricAttention(nn.Module): scale_x: torch.Tensor, # (B, dim_x), modulation for pre-RMSNorm. scale_y: torch.Tensor, # (B, dim_y), modulation for pre-RMSNorm. crop_y, + transformer_options={}, **rope_rotation, ) -> Tuple[torch.Tensor, torch.Tensor]: rope_cos = rope_rotation.get("rope_cos") @@ -143,7 +144,7 @@ class AsymmetricAttention(nn.Module): xy = optimized_attention(q, k, - v, self.num_heads, skip_reshape=True) + v, self.num_heads, skip_reshape=True, transformer_options=transformer_options) x, y = torch.tensor_split(xy, (q_x.shape[1],), dim=1) x = self.proj_x(x) @@ -224,6 +225,7 @@ class AsymmetricJointBlock(nn.Module): x: torch.Tensor, c: torch.Tensor, y: torch.Tensor, + transformer_options={}, **attn_kwargs, ): """Forward pass of a block. @@ -256,6 +258,7 @@ class AsymmetricJointBlock(nn.Module): y, scale_x=scale_msa_x, scale_y=scale_msa_y, + transformer_options=transformer_options, **attn_kwargs, ) @@ -524,10 +527,11 @@ class AsymmDiTJoint(nn.Module): args["txt"], rope_cos=args["rope_cos"], rope_sin=args["rope_sin"], - crop_y=args["num_tokens"] + crop_y=args["num_tokens"], + transformer_options=args["transformer_options"] ) return out - out = blocks_replace[("double_block", i)]({"img": x, "txt": y_feat, "vec": c, "rope_cos": rope_cos, "rope_sin": rope_sin, "num_tokens": num_tokens}, {"original_block": block_wrap}) + out = blocks_replace[("double_block", i)]({"img": x, "txt": y_feat, "vec": c, "rope_cos": rope_cos, "rope_sin": rope_sin, "num_tokens": num_tokens, "transformer_options": transformer_options}, {"original_block": block_wrap}) y_feat = out["txt"] x = out["img"] else: @@ -538,6 +542,7 @@ class AsymmDiTJoint(nn.Module): rope_cos=rope_cos, rope_sin=rope_sin, crop_y=num_tokens, + transformer_options=transformer_options, ) # (B, M, D), (B, L, D) del y_feat # Final layers don't use dense text features. diff --git a/comfy/ldm/hidream/model.py b/comfy/ldm/hidream/model.py index ae49cf945..28d81c79e 100644 --- a/comfy/ldm/hidream/model.py +++ b/comfy/ldm/hidream/model.py @@ -72,8 +72,8 @@ class TimestepEmbed(nn.Module): return t_emb -def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor): - return optimized_attention(query.view(query.shape[0], -1, query.shape[-1] * query.shape[-2]), key.view(key.shape[0], -1, key.shape[-1] * key.shape[-2]), value.view(value.shape[0], -1, value.shape[-1] * value.shape[-2]), query.shape[2]) +def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, transformer_options={}): + return optimized_attention(query.view(query.shape[0], -1, query.shape[-1] * query.shape[-2]), key.view(key.shape[0], -1, key.shape[-1] * key.shape[-2]), value.view(value.shape[0], -1, value.shape[-1] * value.shape[-2]), query.shape[2], transformer_options=transformer_options) class HiDreamAttnProcessor_flashattn: @@ -86,6 +86,7 @@ class HiDreamAttnProcessor_flashattn: image_tokens_masks: Optional[torch.FloatTensor] = None, text_tokens: Optional[torch.FloatTensor] = None, rope: torch.FloatTensor = None, + transformer_options={}, *args, **kwargs, ) -> torch.FloatTensor: @@ -133,7 +134,7 @@ class HiDreamAttnProcessor_flashattn: query = torch.cat([query_1, query_2], dim=-1) key = torch.cat([key_1, key_2], dim=-1) - hidden_states = attention(query, key, value) + hidden_states = attention(query, key, value, transformer_options=transformer_options) if not attn.single: hidden_states_i, hidden_states_t = torch.split(hidden_states, [num_image_tokens, num_text_tokens], dim=1) @@ -199,6 +200,7 @@ class HiDreamAttention(nn.Module): image_tokens_masks: torch.FloatTensor = None, norm_text_tokens: torch.FloatTensor = None, rope: torch.FloatTensor = None, + transformer_options={}, ) -> torch.Tensor: return self.processor( self, @@ -206,6 +208,7 @@ class HiDreamAttention(nn.Module): image_tokens_masks = image_tokens_masks, text_tokens = norm_text_tokens, rope = rope, + transformer_options=transformer_options, ) @@ -406,7 +409,7 @@ class HiDreamImageSingleTransformerBlock(nn.Module): text_tokens: Optional[torch.FloatTensor] = None, adaln_input: Optional[torch.FloatTensor] = None, rope: torch.FloatTensor = None, - + transformer_options={}, ) -> torch.FloatTensor: wtype = image_tokens.dtype shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i = \ @@ -419,6 +422,7 @@ class HiDreamImageSingleTransformerBlock(nn.Module): norm_image_tokens, image_tokens_masks, rope = rope, + transformer_options=transformer_options, ) image_tokens = gate_msa_i * attn_output_i + image_tokens @@ -483,6 +487,7 @@ class HiDreamImageTransformerBlock(nn.Module): text_tokens: Optional[torch.FloatTensor] = None, adaln_input: Optional[torch.FloatTensor] = None, rope: torch.FloatTensor = None, + transformer_options={}, ) -> torch.FloatTensor: wtype = image_tokens.dtype shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i, \ @@ -500,6 +505,7 @@ class HiDreamImageTransformerBlock(nn.Module): image_tokens_masks, norm_text_tokens, rope = rope, + transformer_options=transformer_options, ) image_tokens = gate_msa_i * attn_output_i + image_tokens @@ -550,6 +556,7 @@ class HiDreamImageBlock(nn.Module): text_tokens: Optional[torch.FloatTensor] = None, adaln_input: torch.FloatTensor = None, rope: torch.FloatTensor = None, + transformer_options={}, ) -> torch.FloatTensor: return self.block( image_tokens, @@ -557,6 +564,7 @@ class HiDreamImageBlock(nn.Module): text_tokens, adaln_input, rope, + transformer_options=transformer_options, ) @@ -786,6 +794,7 @@ class HiDreamImageTransformer2DModel(nn.Module): text_tokens = cur_encoder_hidden_states, adaln_input = adaln_input, rope = rope, + transformer_options=transformer_options, ) initial_encoder_hidden_states = initial_encoder_hidden_states[:, :initial_encoder_hidden_states_seq_len] block_id += 1 @@ -809,6 +818,7 @@ class HiDreamImageTransformer2DModel(nn.Module): text_tokens=None, adaln_input=adaln_input, rope=rope, + transformer_options=transformer_options, ) hidden_states = hidden_states[:, :hidden_states_seq_len] block_id += 1 diff --git a/comfy/ldm/hunyuan3d/model.py b/comfy/ldm/hunyuan3d/model.py index 0fa5e78c1..4991b1645 100644 --- a/comfy/ldm/hunyuan3d/model.py +++ b/comfy/ldm/hunyuan3d/model.py @@ -99,14 +99,16 @@ class Hunyuan3Dv2(nn.Module): txt=args["txt"], vec=args["vec"], pe=args["pe"], - attn_mask=args.get("attn_mask")) + attn_mask=args.get("attn_mask"), + transformer_options=args["transformer_options"]) return out out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, - "attn_mask": attn_mask}, + "attn_mask": attn_mask, + "transformer_options": transformer_options}, {"original_block": block_wrap}) txt = out["txt"] img = out["img"] @@ -115,7 +117,8 @@ class Hunyuan3Dv2(nn.Module): txt=txt, vec=vec, pe=pe, - attn_mask=attn_mask) + attn_mask=attn_mask, + transformer_options=transformer_options) img = torch.cat((txt, img), 1) @@ -126,17 +129,19 @@ class Hunyuan3Dv2(nn.Module): out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], - attn_mask=args.get("attn_mask")) + attn_mask=args.get("attn_mask"), + transformer_options=args["transformer_options"]) return out out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, - "attn_mask": attn_mask}, + "attn_mask": attn_mask, + "transformer_options": transformer_options}, {"original_block": block_wrap}) img = out["img"] else: - img = block(img, vec=vec, pe=pe, attn_mask=attn_mask) + img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, transformer_options=transformer_options) img = img[:, txt.shape[1]:, ...] img = self.final_layer(img, vec) diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py index ca86b8bb1..5132e6c07 100644 --- a/comfy/ldm/hunyuan_video/model.py +++ b/comfy/ldm/hunyuan_video/model.py @@ -80,13 +80,13 @@ class TokenRefinerBlock(nn.Module): operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), ) - def forward(self, x, c, mask): + def forward(self, x, c, mask, transformer_options={}): mod1, mod2 = self.adaLN_modulation(c).chunk(2, dim=1) norm_x = self.norm1(x) qkv = self.self_attn.qkv(norm_x) q, k, v = qkv.reshape(qkv.shape[0], qkv.shape[1], 3, self.heads, -1).permute(2, 0, 3, 1, 4) - attn = optimized_attention(q, k, v, self.heads, mask=mask, skip_reshape=True) + attn = optimized_attention(q, k, v, self.heads, mask=mask, skip_reshape=True, transformer_options=transformer_options) x = x + self.self_attn.proj(attn) * mod1.unsqueeze(1) x = x + self.mlp(self.norm2(x)) * mod2.unsqueeze(1) @@ -117,14 +117,14 @@ class IndividualTokenRefiner(nn.Module): ] ) - def forward(self, x, c, mask): + def forward(self, x, c, mask, transformer_options={}): m = None if mask is not None: m = mask.view(mask.shape[0], 1, 1, mask.shape[1]).repeat(1, 1, mask.shape[1], 1) m = m + m.transpose(2, 3) for block in self.blocks: - x = block(x, c, m) + x = block(x, c, m, transformer_options=transformer_options) return x @@ -152,6 +152,7 @@ class TokenRefiner(nn.Module): x, timesteps, mask, + transformer_options={}, ): t = self.t_embedder(timestep_embedding(timesteps, 256, time_factor=1.0).to(x.dtype)) # m = mask.float().unsqueeze(-1) @@ -160,7 +161,7 @@ class TokenRefiner(nn.Module): c = t + self.c_embedder(c.to(x.dtype)) x = self.input_embedder(x) - x = self.individual_token_refiner(x, c, mask) + x = self.individual_token_refiner(x, c, mask, transformer_options=transformer_options) return x @@ -328,7 +329,7 @@ class HunyuanVideo(nn.Module): if txt_mask is not None and not torch.is_floating_point(txt_mask): txt_mask = (txt_mask - 1).to(img.dtype) * torch.finfo(img.dtype).max - txt = self.txt_in(txt, timesteps, txt_mask) + txt = self.txt_in(txt, timesteps, txt_mask, transformer_options=transformer_options) if self.byt5_in is not None and txt_byt5 is not None: txt_byt5 = self.byt5_in(txt_byt5) @@ -352,14 +353,14 @@ class HunyuanVideo(nn.Module): if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} - out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims_img=args["modulation_dims_img"], modulation_dims_txt=args["modulation_dims_txt"]) + out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims_img=args["modulation_dims_img"], modulation_dims_txt=args["modulation_dims_txt"], transformer_options=args["transformer_options"]) return out - out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims_img': modulation_dims, 'modulation_dims_txt': modulation_dims_txt}, {"original_block": block_wrap}) + out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims_img': modulation_dims, 'modulation_dims_txt': modulation_dims_txt, 'transformer_options': transformer_options}, {"original_block": block_wrap}) txt = out["txt"] img = out["img"] else: - img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims_img=modulation_dims, modulation_dims_txt=modulation_dims_txt) + img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims_img=modulation_dims, modulation_dims_txt=modulation_dims_txt, transformer_options=transformer_options) if control is not None: # Controlnet control_i = control.get("input") @@ -374,13 +375,13 @@ class HunyuanVideo(nn.Module): if ("single_block", i) in blocks_replace: def block_wrap(args): out = {} - out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims=args["modulation_dims"]) + out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims=args["modulation_dims"], transformer_options=args["transformer_options"]) return out - out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims': modulation_dims}, {"original_block": block_wrap}) + out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims': modulation_dims, 'transformer_options': transformer_options}, {"original_block": block_wrap}) img = out["img"] else: - img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims) + img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims, transformer_options=transformer_options) if control is not None: # Controlnet control_o = control.get("output") diff --git a/comfy/ldm/lightricks/model.py b/comfy/ldm/lightricks/model.py index aa2ea62b1..def365ba7 100644 --- a/comfy/ldm/lightricks/model.py +++ b/comfy/ldm/lightricks/model.py @@ -271,7 +271,7 @@ class CrossAttention(nn.Module): self.to_out = nn.Sequential(operations.Linear(inner_dim, query_dim, dtype=dtype, device=device), nn.Dropout(dropout)) - def forward(self, x, context=None, mask=None, pe=None): + def forward(self, x, context=None, mask=None, pe=None, transformer_options={}): q = self.to_q(x) context = x if context is None else context k = self.to_k(context) @@ -285,9 +285,9 @@ class CrossAttention(nn.Module): k = apply_rotary_emb(k, pe) if mask is None: - out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision) + out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision, transformer_options=transformer_options) else: - out = comfy.ldm.modules.attention.optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision) + out = comfy.ldm.modules.attention.optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision, transformer_options=transformer_options) return self.to_out(out) @@ -303,12 +303,12 @@ class BasicTransformerBlock(nn.Module): self.scale_shift_table = nn.Parameter(torch.empty(6, dim, device=device, dtype=dtype)) - def forward(self, x, context=None, attention_mask=None, timestep=None, pe=None): + def forward(self, x, context=None, attention_mask=None, timestep=None, pe=None, transformer_options={}): shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None, None].to(device=x.device, dtype=x.dtype) + timestep.reshape(x.shape[0], timestep.shape[1], self.scale_shift_table.shape[0], -1)).unbind(dim=2) - x += self.attn1(comfy.ldm.common_dit.rms_norm(x) * (1 + scale_msa) + shift_msa, pe=pe) * gate_msa + x += self.attn1(comfy.ldm.common_dit.rms_norm(x) * (1 + scale_msa) + shift_msa, pe=pe, transformer_options=transformer_options) * gate_msa - x += self.attn2(x, context=context, mask=attention_mask) + x += self.attn2(x, context=context, mask=attention_mask, transformer_options=transformer_options) y = comfy.ldm.common_dit.rms_norm(x) * (1 + scale_mlp) + shift_mlp x += self.ff(y) * gate_mlp @@ -479,10 +479,10 @@ class LTXVModel(torch.nn.Module): if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} - out["img"] = block(args["img"], context=args["txt"], attention_mask=args["attention_mask"], timestep=args["vec"], pe=args["pe"]) + out["img"] = block(args["img"], context=args["txt"], attention_mask=args["attention_mask"], timestep=args["vec"], pe=args["pe"], transformer_options=args["transformer_options"]) return out - out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "attention_mask": attention_mask, "vec": timestep, "pe": pe}, {"original_block": block_wrap}) + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "attention_mask": attention_mask, "vec": timestep, "pe": pe, "transformer_options": transformer_options}, {"original_block": block_wrap}) x = out["img"] else: x = block( @@ -490,7 +490,8 @@ class LTXVModel(torch.nn.Module): context=context, attention_mask=attention_mask, timestep=timestep, - pe=pe + pe=pe, + transformer_options=transformer_options, ) # 3. Output diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index e08ed817d..f87d98ac0 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -104,6 +104,7 @@ class JointAttention(nn.Module): x: torch.Tensor, x_mask: torch.Tensor, freqs_cis: torch.Tensor, + transformer_options={}, ) -> torch.Tensor: """ @@ -140,7 +141,7 @@ class JointAttention(nn.Module): if n_rep >= 1: xk = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3) xv = xv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3) - output = optimized_attention_masked(xq.movedim(1, 2), xk.movedim(1, 2), xv.movedim(1, 2), self.n_local_heads, x_mask, skip_reshape=True) + output = optimized_attention_masked(xq.movedim(1, 2), xk.movedim(1, 2), xv.movedim(1, 2), self.n_local_heads, x_mask, skip_reshape=True, transformer_options=transformer_options) return self.out(output) @@ -268,6 +269,7 @@ class JointTransformerBlock(nn.Module): x_mask: torch.Tensor, freqs_cis: torch.Tensor, adaln_input: Optional[torch.Tensor]=None, + transformer_options={}, ): """ Perform a forward pass through the TransformerBlock. @@ -290,6 +292,7 @@ class JointTransformerBlock(nn.Module): modulate(self.attention_norm1(x), scale_msa), x_mask, freqs_cis, + transformer_options=transformer_options, ) ) x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2( @@ -304,6 +307,7 @@ class JointTransformerBlock(nn.Module): self.attention_norm1(x), x_mask, freqs_cis, + transformer_options=transformer_options, ) ) x = x + self.ffn_norm2( @@ -494,7 +498,7 @@ class NextDiT(nn.Module): return imgs def patchify_and_embed( - self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens + self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens, transformer_options={} ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor]: bsz = len(x) pH = pW = self.patch_size @@ -554,7 +558,7 @@ class NextDiT(nn.Module): # refine context for layer in self.context_refiner: - cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis) + cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis, transformer_options=transformer_options) # refine image flat_x = [] @@ -573,7 +577,7 @@ class NextDiT(nn.Module): padded_img_embed = self.x_embedder(padded_img_embed) padded_img_mask = padded_img_mask.unsqueeze(1) for layer in self.noise_refiner: - padded_img_embed = layer(padded_img_embed, padded_img_mask, img_freqs_cis, t) + padded_img_embed = layer(padded_img_embed, padded_img_mask, img_freqs_cis, t, transformer_options=transformer_options) if cap_mask is not None: mask = torch.zeros(bsz, max_seq_len, dtype=dtype, device=device) @@ -616,12 +620,13 @@ class NextDiT(nn.Module): cap_feats = self.cap_embedder(cap_feats) # (N, L, D) # todo check if able to batchify w.o. redundant compute + transformer_options = kwargs.get("transformer_options", {}) x_is_tensor = isinstance(x, torch.Tensor) - x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens) + x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options) freqs_cis = freqs_cis.to(x.device) for layer in self.layers: - x = layer(x, mask, freqs_cis, adaln_input) + x = layer(x, mask, freqs_cis, adaln_input, transformer_options=transformer_options) x = self.final_layer(x, adaln_input) x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor)[:,:,:h,:w] diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index 043df28df..bf2553c37 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -5,8 +5,9 @@ import torch import torch.nn.functional as F from torch import nn, einsum from einops import rearrange, repeat -from typing import Optional +from typing import Optional, Any, Callable, Union import logging +import functools from .diffusionmodules.util import AlphaBlender, timestep_embedding from .sub_quadratic_attention import efficient_dot_product_attention @@ -17,23 +18,45 @@ if model_management.xformers_enabled(): import xformers import xformers.ops -if model_management.sage_attention_enabled(): - try: - from sageattention import sageattn - except ModuleNotFoundError as e: +SAGE_ATTENTION_IS_AVAILABLE = False +try: + from sageattention import sageattn + SAGE_ATTENTION_IS_AVAILABLE = True +except ModuleNotFoundError as e: + if model_management.sage_attention_enabled(): if e.name == "sageattention": logging.error(f"\n\nTo use the `--use-sage-attention` feature, the `sageattention` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install sageattention") else: raise e exit(-1) -if model_management.flash_attention_enabled(): - try: - from flash_attn import flash_attn_func - except ModuleNotFoundError: +FLASH_ATTENTION_IS_AVAILABLE = False +try: + from flash_attn import flash_attn_func + FLASH_ATTENTION_IS_AVAILABLE = True +except ModuleNotFoundError: + if model_management.flash_attention_enabled(): logging.error(f"\n\nTo use the `--use-flash-attention` feature, the `flash-attn` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install flash-attn") exit(-1) +REGISTERED_ATTENTION_FUNCTIONS = {} +def register_attention_function(name: str, func: Callable): + # avoid replacing existing functions + if name not in REGISTERED_ATTENTION_FUNCTIONS: + REGISTERED_ATTENTION_FUNCTIONS[name] = func + else: + logging.warning(f"Attention function {name} already registered, skipping registration.") + +def get_attention_function(name: str, default: Any=...) -> Union[Callable, None]: + if name == "optimized": + return optimized_attention + elif name not in REGISTERED_ATTENTION_FUNCTIONS: + if default is ...: + raise KeyError(f"Attention function {name} not found.") + else: + return default + return REGISTERED_ATTENTION_FUNCTIONS[name] + from comfy.cli_args import args import comfy.ops ops = comfy.ops.disable_weight_init @@ -91,7 +114,27 @@ class FeedForward(nn.Module): def Normalize(in_channels, dtype=None, device=None): return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype, device=device) -def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False): + +def wrap_attn(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + remove_attn_wrapper_key = False + try: + if "_inside_attn_wrapper" not in kwargs: + transformer_options = kwargs.get("transformer_options", None) + remove_attn_wrapper_key = True + kwargs["_inside_attn_wrapper"] = True + if transformer_options is not None: + if "optimized_attention_override" in transformer_options: + return transformer_options["optimized_attention_override"](func, *args, **kwargs) + return func(*args, **kwargs) + finally: + if remove_attn_wrapper_key: + del kwargs["_inside_attn_wrapper"] + return wrapper + +@wrap_attn +def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs): attn_precision = get_attn_precision(attn_precision, q.dtype) if skip_reshape: @@ -159,8 +202,8 @@ def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape ) return out - -def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False): +@wrap_attn +def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs): attn_precision = get_attn_precision(attn_precision, query.dtype) if skip_reshape: @@ -230,7 +273,8 @@ def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None, hidden_states = hidden_states.unflatten(0, (-1, heads)).transpose(1,2).flatten(start_dim=2) return hidden_states -def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False): +@wrap_attn +def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs): attn_precision = get_attn_precision(attn_precision, q.dtype) if skip_reshape: @@ -359,7 +403,8 @@ try: except: pass -def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False): +@wrap_attn +def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs): b = q.shape[0] dim_head = q.shape[-1] # check to make sure xformers isn't broken @@ -374,7 +419,7 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh disabled_xformers = True if disabled_xformers: - return attention_pytorch(q, k, v, heads, mask, skip_reshape=skip_reshape) + return attention_pytorch(q, k, v, heads, mask, skip_reshape=skip_reshape, **kwargs) if skip_reshape: # b h k d -> b k h d @@ -427,8 +472,8 @@ else: #TODO: other GPUs ? SDP_BATCH_LIMIT = 2**31 - -def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False): +@wrap_attn +def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs): if skip_reshape: b, _, _, dim_head = q.shape else: @@ -470,8 +515,8 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha ).transpose(1, 2).reshape(-1, q.shape[2], heads * dim_head) return out - -def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False): +@wrap_attn +def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs): if skip_reshape: b, _, _, dim_head = q.shape tensor_layout = "HND" @@ -501,7 +546,7 @@ def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape= lambda t: t.transpose(1, 2), (q, k, v), ) - return attention_pytorch(q, k, v, heads, mask=mask, skip_reshape=True, skip_output_reshape=skip_output_reshape) + return attention_pytorch(q, k, v, heads, mask=mask, skip_reshape=True, skip_output_reshape=skip_output_reshape, **kwargs) if tensor_layout == "HND": if not skip_output_reshape: @@ -534,8 +579,8 @@ except AttributeError as error: dropout_p: float = 0.0, causal: bool = False) -> torch.Tensor: assert False, f"Could not define flash_attn_wrapper: {FLASH_ATTN_ERROR}" - -def attention_flash(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False): +@wrap_attn +def attention_flash(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs): if skip_reshape: b, _, _, dim_head = q.shape else: @@ -597,6 +642,19 @@ else: optimized_attention_masked = optimized_attention + +# register core-supported attention functions +if SAGE_ATTENTION_IS_AVAILABLE: + register_attention_function("sage", attention_sage) +if FLASH_ATTENTION_IS_AVAILABLE: + register_attention_function("flash", attention_flash) +if model_management.xformers_enabled(): + register_attention_function("xformers", attention_xformers) +register_attention_function("pytorch", attention_pytorch) +register_attention_function("sub_quad", attention_sub_quad) +register_attention_function("split", attention_split) + + def optimized_attention_for_device(device, mask=False, small_input=False): if small_input: if model_management.pytorch_attention_enabled(): @@ -629,7 +687,7 @@ class CrossAttention(nn.Module): self.to_out = nn.Sequential(operations.Linear(inner_dim, query_dim, dtype=dtype, device=device), nn.Dropout(dropout)) - def forward(self, x, context=None, value=None, mask=None): + def forward(self, x, context=None, value=None, mask=None, transformer_options={}): q = self.to_q(x) context = default(context, x) k = self.to_k(context) @@ -640,9 +698,9 @@ class CrossAttention(nn.Module): v = self.to_v(context) if mask is None: - out = optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision) + out = optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision, transformer_options=transformer_options) else: - out = optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision) + out = optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision, transformer_options=transformer_options) return self.to_out(out) @@ -746,7 +804,7 @@ class BasicTransformerBlock(nn.Module): n = attn1_replace_patch[block_attn1](n, context_attn1, value_attn1, extra_options) n = self.attn1.to_out(n) else: - n = self.attn1(n, context=context_attn1, value=value_attn1) + n = self.attn1(n, context=context_attn1, value=value_attn1, transformer_options=transformer_options) if "attn1_output_patch" in transformer_patches: patch = transformer_patches["attn1_output_patch"] @@ -786,7 +844,7 @@ class BasicTransformerBlock(nn.Module): n = attn2_replace_patch[block_attn2](n, context_attn2, value_attn2, extra_options) n = self.attn2.to_out(n) else: - n = self.attn2(n, context=context_attn2, value=value_attn2) + n = self.attn2(n, context=context_attn2, value=value_attn2, transformer_options=transformer_options) if "attn2_output_patch" in transformer_patches: patch = transformer_patches["attn2_output_patch"] @@ -1017,7 +1075,7 @@ class SpatialVideoTransformer(SpatialTransformer): B, S, C = x_mix.shape x_mix = rearrange(x_mix, "(b t) s c -> (b s) t c", t=timesteps) - x_mix = mix_block(x_mix, context=time_context) #TODO: transformer_options + x_mix = mix_block(x_mix, context=time_context, transformer_options=transformer_options) x_mix = rearrange( x_mix, "(b s) t c -> (b t) s c", s=S, b=B // timesteps, c=C, t=timesteps ) diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py index 4d6beba2d..42f406f1a 100644 --- a/comfy/ldm/modules/diffusionmodules/mmdit.py +++ b/comfy/ldm/modules/diffusionmodules/mmdit.py @@ -606,7 +606,7 @@ def block_mixing(*args, use_checkpoint=True, **kwargs): return _block_mixing(*args, **kwargs) -def _block_mixing(context, x, context_block, x_block, c): +def _block_mixing(context, x, context_block, x_block, c, transformer_options={}): context_qkv, context_intermediates = context_block.pre_attention(context, c) if x_block.x_block_self_attn: @@ -622,6 +622,7 @@ def _block_mixing(context, x, context_block, x_block, c): attn = optimized_attention( qkv[0], qkv[1], qkv[2], heads=x_block.attn.num_heads, + transformer_options=transformer_options, ) context_attn, x_attn = ( attn[:, : context_qkv[0].shape[1]], @@ -637,6 +638,7 @@ def _block_mixing(context, x, context_block, x_block, c): attn2 = optimized_attention( x_qkv2[0], x_qkv2[1], x_qkv2[2], heads=x_block.attn2.num_heads, + transformer_options=transformer_options, ) x = x_block.post_attention_x(x_attn, attn2, *x_intermediates) else: @@ -958,10 +960,10 @@ class MMDiT(nn.Module): if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} - out["txt"], out["img"] = self.joint_blocks[i](args["txt"], args["img"], c=args["vec"]) + out["txt"], out["img"] = self.joint_blocks[i](args["txt"], args["img"], c=args["vec"], transformer_options=args["transformer_options"]) return out - out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": c_mod}, {"original_block": block_wrap}) + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": c_mod, "transformer_options": transformer_options}, {"original_block": block_wrap}) context = out["txt"] x = out["img"] else: @@ -970,6 +972,7 @@ class MMDiT(nn.Module): x, c=c_mod, use_checkpoint=self.use_checkpoint, + transformer_options=transformer_options, ) if control is not None: control_o = control.get("output") diff --git a/comfy/ldm/omnigen/omnigen2.py b/comfy/ldm/omnigen/omnigen2.py index 4884449f8..82edc92da 100644 --- a/comfy/ldm/omnigen/omnigen2.py +++ b/comfy/ldm/omnigen/omnigen2.py @@ -120,7 +120,7 @@ class Attention(nn.Module): nn.Dropout(0.0) ) - def forward(self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, image_rotary_emb: Optional[torch.Tensor] = None) -> torch.Tensor: + def forward(self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, image_rotary_emb: Optional[torch.Tensor] = None, transformer_options={}) -> torch.Tensor: batch_size, sequence_length, _ = hidden_states.shape query = self.to_q(hidden_states) @@ -146,7 +146,7 @@ class Attention(nn.Module): key = key.repeat_interleave(self.heads // self.kv_heads, dim=1) value = value.repeat_interleave(self.heads // self.kv_heads, dim=1) - hidden_states = optimized_attention_masked(query, key, value, self.heads, attention_mask, skip_reshape=True) + hidden_states = optimized_attention_masked(query, key, value, self.heads, attention_mask, skip_reshape=True, transformer_options=transformer_options) hidden_states = self.to_out[0](hidden_states) return hidden_states @@ -182,16 +182,16 @@ class OmniGen2TransformerBlock(nn.Module): self.norm2 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device) self.ffn_norm2 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device) - def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, image_rotary_emb: torch.Tensor, temb: Optional[torch.Tensor] = None) -> torch.Tensor: + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, image_rotary_emb: torch.Tensor, temb: Optional[torch.Tensor] = None, transformer_options={}) -> torch.Tensor: if self.modulation: norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb) - attn_output = self.attn(norm_hidden_states, norm_hidden_states, attention_mask, image_rotary_emb) + attn_output = self.attn(norm_hidden_states, norm_hidden_states, attention_mask, image_rotary_emb, transformer_options=transformer_options) hidden_states = hidden_states + gate_msa.unsqueeze(1).tanh() * self.norm2(attn_output) mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1))) hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output) else: norm_hidden_states = self.norm1(hidden_states) - attn_output = self.attn(norm_hidden_states, norm_hidden_states, attention_mask, image_rotary_emb) + attn_output = self.attn(norm_hidden_states, norm_hidden_states, attention_mask, image_rotary_emb, transformer_options=transformer_options) hidden_states = hidden_states + self.norm2(attn_output) mlp_output = self.feed_forward(self.ffn_norm1(hidden_states)) hidden_states = hidden_states + self.ffn_norm2(mlp_output) @@ -390,7 +390,7 @@ class OmniGen2Transformer2DModel(nn.Module): ref_img_sizes, img_sizes, ) - def img_patch_embed_and_refine(self, hidden_states, ref_image_hidden_states, padded_img_mask, padded_ref_img_mask, noise_rotary_emb, ref_img_rotary_emb, l_effective_ref_img_len, l_effective_img_len, temb): + def img_patch_embed_and_refine(self, hidden_states, ref_image_hidden_states, padded_img_mask, padded_ref_img_mask, noise_rotary_emb, ref_img_rotary_emb, l_effective_ref_img_len, l_effective_img_len, temb, transformer_options={}): batch_size = len(hidden_states) hidden_states = self.x_embedder(hidden_states) @@ -405,17 +405,17 @@ class OmniGen2Transformer2DModel(nn.Module): shift += ref_img_len for layer in self.noise_refiner: - hidden_states = layer(hidden_states, padded_img_mask, noise_rotary_emb, temb) + hidden_states = layer(hidden_states, padded_img_mask, noise_rotary_emb, temb, transformer_options=transformer_options) if ref_image_hidden_states is not None: for layer in self.ref_image_refiner: - ref_image_hidden_states = layer(ref_image_hidden_states, padded_ref_img_mask, ref_img_rotary_emb, temb) + ref_image_hidden_states = layer(ref_image_hidden_states, padded_ref_img_mask, ref_img_rotary_emb, temb, transformer_options=transformer_options) hidden_states = torch.cat([ref_image_hidden_states, hidden_states], dim=1) return hidden_states - def forward(self, x, timesteps, context, num_tokens, ref_latents=None, attention_mask=None, **kwargs): + def forward(self, x, timesteps, context, num_tokens, ref_latents=None, attention_mask=None, transformer_options={}, **kwargs): B, C, H, W = x.shape hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size)) _, _, H_padded, W_padded = hidden_states.shape @@ -444,7 +444,7 @@ class OmniGen2Transformer2DModel(nn.Module): ) for layer in self.context_refiner: - text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb) + text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb, transformer_options=transformer_options) img_len = hidden_states.shape[1] combined_img_hidden_states = self.img_patch_embed_and_refine( @@ -453,13 +453,14 @@ class OmniGen2Transformer2DModel(nn.Module): noise_rotary_emb, ref_img_rotary_emb, l_effective_ref_img_len, l_effective_img_len, temb, + transformer_options=transformer_options, ) hidden_states = torch.cat([text_hidden_states, combined_img_hidden_states], dim=1) attention_mask = None for layer in self.layers: - hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb) + hidden_states = layer(hidden_states, attention_mask, rotary_emb, temb, transformer_options=transformer_options) hidden_states = self.norm_out(hidden_states, temb) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 04071f31c..b9f60c2b7 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -132,6 +132,7 @@ class Attention(nn.Module): encoder_hidden_states_mask: torch.FloatTensor = None, attention_mask: Optional[torch.FloatTensor] = None, image_rotary_emb: Optional[torch.Tensor] = None, + transformer_options={}, ) -> Tuple[torch.Tensor, torch.Tensor]: seq_txt = encoder_hidden_states.shape[1] @@ -159,7 +160,7 @@ class Attention(nn.Module): joint_key = joint_key.flatten(start_dim=2) joint_value = joint_value.flatten(start_dim=2) - joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, attention_mask) + joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, attention_mask, transformer_options=transformer_options) txt_attn_output = joint_hidden_states[:, :seq_txt, :] img_attn_output = joint_hidden_states[:, seq_txt:, :] @@ -226,6 +227,7 @@ class QwenImageTransformerBlock(nn.Module): encoder_hidden_states_mask: torch.Tensor, temb: torch.Tensor, image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + transformer_options={}, ) -> Tuple[torch.Tensor, torch.Tensor]: img_mod_params = self.img_mod(temb) txt_mod_params = self.txt_mod(temb) @@ -242,6 +244,7 @@ class QwenImageTransformerBlock(nn.Module): encoder_hidden_states=txt_modulated, encoder_hidden_states_mask=encoder_hidden_states_mask, image_rotary_emb=image_rotary_emb, + transformer_options=transformer_options, ) hidden_states = hidden_states + img_gate1 * img_attn_output @@ -434,9 +437,9 @@ class QwenImageTransformer2DModel(nn.Module): if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} - out["txt"], out["img"] = block(hidden_states=args["img"], encoder_hidden_states=args["txt"], encoder_hidden_states_mask=encoder_hidden_states_mask, temb=args["vec"], image_rotary_emb=args["pe"]) + out["txt"], out["img"] = block(hidden_states=args["img"], encoder_hidden_states=args["txt"], encoder_hidden_states_mask=encoder_hidden_states_mask, temb=args["vec"], image_rotary_emb=args["pe"], transformer_options=args["transformer_options"]) return out - out = blocks_replace[("double_block", i)]({"img": hidden_states, "txt": encoder_hidden_states, "vec": temb, "pe": image_rotary_emb}, {"original_block": block_wrap}) + out = blocks_replace[("double_block", i)]({"img": hidden_states, "txt": encoder_hidden_states, "vec": temb, "pe": image_rotary_emb, "transformer_options": transformer_options}, {"original_block": block_wrap}) hidden_states = out["img"] encoder_hidden_states = out["txt"] else: @@ -446,11 +449,12 @@ class QwenImageTransformer2DModel(nn.Module): encoder_hidden_states_mask=encoder_hidden_states_mask, temb=temb, image_rotary_emb=image_rotary_emb, + transformer_options=transformer_options, ) if "double_block" in patches: for p in patches["double_block"]: - out = p({"img": hidden_states, "txt": encoder_hidden_states, "x": x, "block_index": i}) + out = p({"img": hidden_states, "txt": encoder_hidden_states, "x": x, "block_index": i, "transformer_options": transformer_options}) hidden_states = out["img"] encoder_hidden_states = out["txt"] diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 47857dc2b..63472ada2 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -52,7 +52,7 @@ class WanSelfAttention(nn.Module): self.norm_q = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity() self.norm_k = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity() - def forward(self, x, freqs): + def forward(self, x, freqs, transformer_options={}): r""" Args: x(Tensor): Shape [B, L, num_heads, C / num_heads] @@ -75,6 +75,7 @@ class WanSelfAttention(nn.Module): k.view(b, s, n * d), v, heads=self.num_heads, + transformer_options=transformer_options, ) x = self.o(x) @@ -83,7 +84,7 @@ class WanSelfAttention(nn.Module): class WanT2VCrossAttention(WanSelfAttention): - def forward(self, x, context, **kwargs): + def forward(self, x, context, transformer_options={}, **kwargs): r""" Args: x(Tensor): Shape [B, L1, C] @@ -95,7 +96,7 @@ class WanT2VCrossAttention(WanSelfAttention): v = self.v(context) # compute attention - x = optimized_attention(q, k, v, heads=self.num_heads) + x = optimized_attention(q, k, v, heads=self.num_heads, transformer_options=transformer_options) x = self.o(x) return x @@ -116,7 +117,7 @@ class WanI2VCrossAttention(WanSelfAttention): # self.alpha = nn.Parameter(torch.zeros((1, ))) self.norm_k_img = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity() - def forward(self, x, context, context_img_len): + def forward(self, x, context, context_img_len, transformer_options={}): r""" Args: x(Tensor): Shape [B, L1, C] @@ -131,9 +132,9 @@ class WanI2VCrossAttention(WanSelfAttention): v = self.v(context) k_img = self.norm_k_img(self.k_img(context_img)) v_img = self.v_img(context_img) - img_x = optimized_attention(q, k_img, v_img, heads=self.num_heads) + img_x = optimized_attention(q, k_img, v_img, heads=self.num_heads, transformer_options=transformer_options) # compute attention - x = optimized_attention(q, k, v, heads=self.num_heads) + x = optimized_attention(q, k, v, heads=self.num_heads, transformer_options=transformer_options) # output x = x + img_x @@ -206,6 +207,7 @@ class WanAttentionBlock(nn.Module): freqs, context, context_img_len=257, + transformer_options={}, ): r""" Args: @@ -224,12 +226,12 @@ class WanAttentionBlock(nn.Module): # self-attention y = self.self_attn( torch.addcmul(repeat_e(e[0], x), self.norm1(x), 1 + repeat_e(e[1], x)), - freqs) + freqs, transformer_options=transformer_options) x = torch.addcmul(x, y, repeat_e(e[2], x)) # cross-attention & ffn - x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len) + x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len, transformer_options=transformer_options) y = self.ffn(torch.addcmul(repeat_e(e[3], x), self.norm2(x), 1 + repeat_e(e[4], x))) x = torch.addcmul(x, y, repeat_e(e[5], x)) return x @@ -559,12 +561,12 @@ class WanModel(torch.nn.Module): if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} - out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len) + out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"]) return out - out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap}) + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap}) x = out["img"] else: - x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len) + x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options) # head x = self.head(x, e) @@ -742,17 +744,17 @@ class VaceWanModel(WanModel): if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} - out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len) + out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"]) return out - out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap}) + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap}) x = out["img"] else: - x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len) + x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options) ii = self.vace_layers_mapping.get(i, None) if ii is not None: for iii in range(len(c)): - c_skip, c[iii] = self.vace_blocks[ii](c[iii], x=x_orig, e=e0, freqs=freqs, context=context, context_img_len=context_img_len) + c_skip, c[iii] = self.vace_blocks[ii](c[iii], x=x_orig, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options) x += c_skip * vace_strength[iii] del c_skip # head @@ -841,12 +843,12 @@ class CameraWanModel(WanModel): if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} - out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len) + out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"]) return out - out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap}) + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap}) x = out["img"] else: - x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len) + x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options) # head x = self.head(x, e) From a3b04de7004cc19dee9364bd71e62bab05475810 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 12 Sep 2025 16:46:46 -0700 Subject: [PATCH 282/759] Hunyuan refiner vae now works with tiled. (#9836) --- comfy/ldm/hunyuan_video/vae_refiner.py | 1 - comfy/sd.py | 21 +++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/comfy/ldm/hunyuan_video/vae_refiner.py b/comfy/ldm/hunyuan_video/vae_refiner.py index e3fff9bbe..c6f742710 100644 --- a/comfy/ldm/hunyuan_video/vae_refiner.py +++ b/comfy/ldm/hunyuan_video/vae_refiner.py @@ -185,7 +185,6 @@ class Encoder(nn.Module): self.regul = comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer() def forward(self, x): - x = x.unsqueeze(2) x = self.conv_in(x) for stage in self.down: diff --git a/comfy/sd.py b/comfy/sd.py index 02ddc7239..f8f1a89e8 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -412,9 +412,12 @@ class VAE: self.working_dtypes = [torch.bfloat16, torch.float32] elif "decoder.conv_in.conv.weight" in sd and sd['decoder.conv_in.conv.weight'].shape[1] == 32: ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True} - self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1] - self.downscale_ratio = 16 - self.upscale_ratio = 16 + ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1] + self.latent_channels = 64 + self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16) + self.upscale_index_formula = (4, 16, 16) + self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16) + self.downscale_index_formula = (4, 16, 16) self.latent_dim = 3 self.not_video = True self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] @@ -684,8 +687,11 @@ class VAE: self.throw_exception_if_invalid() pixel_samples = self.vae_encode_crop_pixels(pixel_samples) pixel_samples = pixel_samples.movedim(-1, 1) - if not self.not_video and self.latent_dim == 3 and pixel_samples.ndim < 5: - pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) + if self.latent_dim == 3 and pixel_samples.ndim < 5: + if not self.not_video: + pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) + else: + pixel_samples = pixel_samples.unsqueeze(2) try: memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) @@ -719,7 +725,10 @@ class VAE: dims = self.latent_dim pixel_samples = pixel_samples.movedim(-1, 1) if dims == 3: - pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) + if not self.not_video: + pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) + else: + pixel_samples = pixel_samples.unsqueeze(2) memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) # TODO: calculate mem required for tile model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) From 2559dee49202365bc97218b98121e796f57dfcb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Sat, 13 Sep 2025 04:52:58 +0300 Subject: [PATCH 283/759] Support wav2vec base models (#9637) * Support wav2vec base models * trim trailing whitespace * Do interpolation after --- comfy/audio_encoders/audio_encoders.py | 36 ++++++++++- comfy/audio_encoders/wav2vec2.py | 87 +++++++++++++++++++------- 2 files changed, 99 insertions(+), 24 deletions(-) diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py index 538c21bd5..d1ec78f69 100644 --- a/comfy/audio_encoders/audio_encoders.py +++ b/comfy/audio_encoders/audio_encoders.py @@ -11,7 +11,13 @@ class AudioEncoderModel(): self.load_device = comfy.model_management.text_encoder_device() offload_device = comfy.model_management.text_encoder_offload_device() self.dtype = comfy.model_management.text_encoder_dtype(self.load_device) - self.model = Wav2Vec2Model(dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast) + model_config = dict(config) + model_config.update({ + "dtype": self.dtype, + "device": offload_device, + "operations": comfy.ops.manual_cast + }) + self.model = Wav2Vec2Model(**model_config) self.model.eval() self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device) self.model_sample_rate = 16000 @@ -25,7 +31,7 @@ class AudioEncoderModel(): def encode_audio(self, audio, sample_rate): comfy.model_management.load_model_gpu(self.patcher) audio = torchaudio.functional.resample(audio, sample_rate, self.model_sample_rate) - out, all_layers = self.model(audio.to(self.load_device)) + out, all_layers = self.model(audio.to(self.load_device), sr=self.model_sample_rate) outputs = {} outputs["encoded_audio"] = out outputs["encoded_audio_all_layers"] = all_layers @@ -33,8 +39,32 @@ class AudioEncoderModel(): def load_audio_encoder_from_sd(sd, prefix=""): - audio_encoder = AudioEncoderModel(None) sd = comfy.utils.state_dict_prefix_replace(sd, {"wav2vec2.": ""}) + embed_dim = sd["encoder.layer_norm.bias"].shape[0] + if embed_dim == 1024:# large + config = { + "embed_dim": 1024, + "num_heads": 16, + "num_layers": 24, + "conv_norm": True, + "conv_bias": True, + "do_normalize": True, + "do_stable_layer_norm": True + } + elif embed_dim == 768: # base + config = { + "embed_dim": 768, + "num_heads": 12, + "num_layers": 12, + "conv_norm": False, + "conv_bias": False, + "do_normalize": False, # chinese-wav2vec2-base has this False + "do_stable_layer_norm": False + } + else: + raise RuntimeError("ERROR: audio encoder file is invalid or unsupported embed_dim: {}".format(embed_dim)) + + audio_encoder = AudioEncoderModel(config) m, u = audio_encoder.load_sd(sd) if len(m) > 0: logging.warning("missing audio encoder: {}".format(m)) diff --git a/comfy/audio_encoders/wav2vec2.py b/comfy/audio_encoders/wav2vec2.py index de906622a..ef10dcd2a 100644 --- a/comfy/audio_encoders/wav2vec2.py +++ b/comfy/audio_encoders/wav2vec2.py @@ -13,19 +13,49 @@ class LayerNormConv(nn.Module): x = self.conv(x) return torch.nn.functional.gelu(self.layer_norm(x.transpose(-2, -1)).transpose(-2, -1)) +class LayerGroupNormConv(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride, bias=False, dtype=None, device=None, operations=None): + super().__init__() + self.conv = operations.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=bias, device=device, dtype=dtype) + self.layer_norm = operations.GroupNorm(num_groups=out_channels, num_channels=out_channels, affine=True, device=device, dtype=dtype) + + def forward(self, x): + x = self.conv(x) + return torch.nn.functional.gelu(self.layer_norm(x)) + +class ConvNoNorm(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride, bias=False, dtype=None, device=None, operations=None): + super().__init__() + self.conv = operations.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, bias=bias, device=device, dtype=dtype) + + def forward(self, x): + x = self.conv(x) + return torch.nn.functional.gelu(x) + class ConvFeatureEncoder(nn.Module): - def __init__(self, conv_dim, dtype=None, device=None, operations=None): + def __init__(self, conv_dim, conv_bias=False, conv_norm=True, dtype=None, device=None, operations=None): super().__init__() - self.conv_layers = nn.ModuleList([ - LayerNormConv(1, conv_dim, kernel_size=10, stride=5, bias=True, device=device, dtype=dtype, operations=operations), - LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), - LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), - LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), - LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=True, device=device, dtype=dtype, operations=operations), - LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations), - LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=True, device=device, dtype=dtype, operations=operations), - ]) + if conv_norm: + self.conv_layers = nn.ModuleList([ + LayerNormConv(1, conv_dim, kernel_size=10, stride=5, bias=True, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + LayerNormConv(conv_dim, conv_dim, kernel_size=2, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ]) + else: + self.conv_layers = nn.ModuleList([ + LayerGroupNormConv(1, conv_dim, kernel_size=10, stride=5, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ConvNoNorm(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ConvNoNorm(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ConvNoNorm(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ConvNoNorm(conv_dim, conv_dim, kernel_size=3, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ConvNoNorm(conv_dim, conv_dim, kernel_size=2, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ConvNoNorm(conv_dim, conv_dim, kernel_size=2, stride=2, bias=conv_bias, device=device, dtype=dtype, operations=operations), + ]) def forward(self, x): x = x.unsqueeze(1) @@ -76,6 +106,7 @@ class TransformerEncoder(nn.Module): num_heads=12, num_layers=12, mlp_ratio=4.0, + do_stable_layer_norm=True, dtype=None, device=None, operations=None ): super().__init__() @@ -86,20 +117,25 @@ class TransformerEncoder(nn.Module): embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, + do_stable_layer_norm=do_stable_layer_norm, device=device, dtype=dtype, operations=operations ) for _ in range(num_layers) ]) self.layer_norm = operations.LayerNorm(embed_dim, eps=1e-05, device=device, dtype=dtype) + self.do_stable_layer_norm = do_stable_layer_norm def forward(self, x, mask=None): x = x + self.pos_conv_embed(x) all_x = () + if not self.do_stable_layer_norm: + x = self.layer_norm(x) for layer in self.layers: all_x += (x,) x = layer(x, mask) - x = self.layer_norm(x) + if self.do_stable_layer_norm: + x = self.layer_norm(x) all_x += (x,) return x, all_x @@ -145,6 +181,7 @@ class TransformerEncoderLayer(nn.Module): embed_dim=768, num_heads=12, mlp_ratio=4.0, + do_stable_layer_norm=True, dtype=None, device=None, operations=None ): super().__init__() @@ -154,15 +191,19 @@ class TransformerEncoderLayer(nn.Module): self.layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype) self.feed_forward = FeedForward(embed_dim, mlp_ratio, device=device, dtype=dtype, operations=operations) self.final_layer_norm = operations.LayerNorm(embed_dim, device=device, dtype=dtype) + self.do_stable_layer_norm = do_stable_layer_norm def forward(self, x, mask=None): residual = x - x = self.layer_norm(x) + if self.do_stable_layer_norm: + x = self.layer_norm(x) x = self.attention(x, mask=mask) x = residual + x - - x = x + self.feed_forward(self.final_layer_norm(x)) - return x + if not self.do_stable_layer_norm: + x = self.layer_norm(x) + return self.final_layer_norm(x + self.feed_forward(x)) + else: + return x + self.feed_forward(self.final_layer_norm(x)) class Wav2Vec2Model(nn.Module): @@ -174,34 +215,38 @@ class Wav2Vec2Model(nn.Module): final_dim=256, num_heads=16, num_layers=24, + conv_norm=True, + conv_bias=True, + do_normalize=True, + do_stable_layer_norm=True, dtype=None, device=None, operations=None ): super().__init__() conv_dim = 512 - self.feature_extractor = ConvFeatureEncoder(conv_dim, device=device, dtype=dtype, operations=operations) + self.feature_extractor = ConvFeatureEncoder(conv_dim, conv_norm=conv_norm, conv_bias=conv_bias, device=device, dtype=dtype, operations=operations) self.feature_projection = FeatureProjection(conv_dim, embed_dim, device=device, dtype=dtype, operations=operations) self.masked_spec_embed = nn.Parameter(torch.empty(embed_dim, device=device, dtype=dtype)) + self.do_normalize = do_normalize self.encoder = TransformerEncoder( embed_dim=embed_dim, num_heads=num_heads, num_layers=num_layers, + do_stable_layer_norm=do_stable_layer_norm, device=device, dtype=dtype, operations=operations ) - def forward(self, x, mask_time_indices=None, return_dict=False): - + def forward(self, x, sr=16000, mask_time_indices=None, return_dict=False): x = torch.mean(x, dim=1) - x = (x - x.mean()) / torch.sqrt(x.var() + 1e-7) + if self.do_normalize: + x = (x - x.mean()) / torch.sqrt(x.var() + 1e-7) features = self.feature_extractor(x) features = self.feature_projection(features) - batch_size, seq_len, _ = features.shape x, all_x = self.encoder(features) - return x, all_x From 29bf807b0e2d89402d555d08bd8e9df15e636f0c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 12 Sep 2025 18:57:04 -0700 Subject: [PATCH 284/759] Cleanup. (#9838) --- comfy/audio_encoders/audio_encoders.py | 2 +- comfy/audio_encoders/wav2vec2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py index d1ec78f69..6fb5b08e9 100644 --- a/comfy/audio_encoders/audio_encoders.py +++ b/comfy/audio_encoders/audio_encoders.py @@ -31,7 +31,7 @@ class AudioEncoderModel(): def encode_audio(self, audio, sample_rate): comfy.model_management.load_model_gpu(self.patcher) audio = torchaudio.functional.resample(audio, sample_rate, self.model_sample_rate) - out, all_layers = self.model(audio.to(self.load_device), sr=self.model_sample_rate) + out, all_layers = self.model(audio.to(self.load_device)) outputs = {} outputs["encoded_audio"] = out outputs["encoded_audio_all_layers"] = all_layers diff --git a/comfy/audio_encoders/wav2vec2.py b/comfy/audio_encoders/wav2vec2.py index ef10dcd2a..4e34a40a7 100644 --- a/comfy/audio_encoders/wav2vec2.py +++ b/comfy/audio_encoders/wav2vec2.py @@ -238,7 +238,7 @@ class Wav2Vec2Model(nn.Module): device=device, dtype=dtype, operations=operations ) - def forward(self, x, sr=16000, mask_time_indices=None, return_dict=False): + def forward(self, x, mask_time_indices=None, return_dict=False): x = torch.mean(x, dim=1) if self.do_normalize: From e5e70636e7b7b54695220a88ab036c1607959736 Mon Sep 17 00:00:00 2001 From: Kimbing Ng <50580578+KimbingNg@users.noreply.github.com> Date: Sun, 14 Sep 2025 04:59:19 +0800 Subject: [PATCH 285/759] Remove single quote pattern to avoid wrong matches (#9842) --- comfy/text_encoders/hunyuan_image.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/comfy/text_encoders/hunyuan_image.py b/comfy/text_encoders/hunyuan_image.py index be396cae7..699eddc33 100644 --- a/comfy/text_encoders/hunyuan_image.py +++ b/comfy/text_encoders/hunyuan_image.py @@ -22,17 +22,14 @@ class HunyuanImageTokenizer(QwenImageTokenizer): # ByT5 processing for HunyuanImage text_prompt_texts = [] - pattern_quote_single = r'\'(.*?)\'' pattern_quote_double = r'\"(.*?)\"' pattern_quote_chinese_single = r'‘(.*?)’' pattern_quote_chinese_double = r'“(.*?)”' - matches_quote_single = re.findall(pattern_quote_single, text) matches_quote_double = re.findall(pattern_quote_double, text) matches_quote_chinese_single = re.findall(pattern_quote_chinese_single, text) matches_quote_chinese_double = re.findall(pattern_quote_chinese_double, text) - text_prompt_texts.extend(matches_quote_single) text_prompt_texts.extend(matches_quote_double) text_prompt_texts.extend(matches_quote_chinese_single) text_prompt_texts.extend(matches_quote_chinese_double) From c1297f4eb38a63e2f99c9fa76e32e3a36c933b85 Mon Sep 17 00:00:00 2001 From: blepping <157360029+blepping@users.noreply.github.com> Date: Sat, 13 Sep 2025 15:58:43 -0600 Subject: [PATCH 286/759] Add support for Chroma Radiance (#9682) * Initial Chroma Radiance support * Minor Chroma Radiance cleanups * Update Radiance nodes to ensure latents/images are on the intermediate device * Fix Chroma Radiance memory estimation. * Increase Chroma Radiance memory usage factor * Increase Chroma Radiance memory usage factor once again * Ensure images are multiples of 16 for Chroma Radiance Add batch dimension and fix channels when necessary in ChromaRadianceImageToLatent node * Tile Chroma Radiance NeRF to reduce memory consumption, update memory usage factor * Update Radiance to support conv nerf final head type. * Allow setting NeRF embedder dtype for Radiance Bump Radiance nerf tile size to 32 Support EasyCache/LazyCache on Radiance (maybe) * Add ChromaRadianceStubVAE node * Crop Radiance image inputs to multiples of 16 instead of erroring to be in line with existing VAE behavior * Convert Chroma Radiance nodes to V3 schema. * Add ChromaRadianceOptions node and backend support. Cleanups/refactoring to reduce code duplication with Chroma. * Fix overriding the NeRF embedder dtype for Chroma Radiance * Minor Chroma Radiance cleanups * Move Chroma Radiance to its own directory in ldm Minor code cleanups and tooltip improvements * Fix Chroma Radiance embedder dtype overriding * Remove Radiance dynamic nerf_embedder dtype override feature * Unbork Radiance NeRF embedder init * Remove Chroma Radiance image conversion and stub VAE nodes Add a chroma_radiance option to the VAELoader builtin node which uses comfy.sd.PixelspaceConversionVAE Add a PixelspaceConversionVAE to comfy.sd for converting BHWC 0..1 <-> BCHW -1..1 --- comfy/latent_formats.py | 17 ++ comfy/ldm/chroma/model.py | 10 +- comfy/ldm/chroma_radiance/layers.py | 206 ++++++++++++++++ comfy/ldm/chroma_radiance/model.py | 328 ++++++++++++++++++++++++++ comfy/model_base.py | 9 +- comfy/model_detection.py | 14 +- comfy/sd.py | 60 +++++ comfy/supported_models.py | 15 +- comfy_extras/nodes_chroma_radiance.py | 114 +++++++++ nodes.py | 6 +- 10 files changed, 770 insertions(+), 9 deletions(-) create mode 100644 comfy/ldm/chroma_radiance/layers.py create mode 100644 comfy/ldm/chroma_radiance/model.py create mode 100644 comfy_extras/nodes_chroma_radiance.py diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 894540879..77e642a94 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -629,3 +629,20 @@ class Hunyuan3Dv2mini(LatentFormat): class ACEAudio(LatentFormat): latent_channels = 8 latent_dimensions = 2 + +class ChromaRadiance(LatentFormat): + latent_channels = 3 + + def __init__(self): + self.latent_rgb_factors = [ + # R G B + [ 1.0, 0.0, 0.0 ], + [ 0.0, 1.0, 0.0 ], + [ 0.0, 0.0, 1.0 ] + ] + + def process_in(self, latent): + return latent + + def process_out(self, latent): + return latent diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py index 4f709f87d..ad1c523fe 100644 --- a/comfy/ldm/chroma/model.py +++ b/comfy/ldm/chroma/model.py @@ -151,8 +151,6 @@ class Chroma(nn.Module): attn_mask: Tensor = None, ) -> Tensor: patches_replace = transformer_options.get("patches_replace", {}) - if img.ndim != 3 or txt.ndim != 3: - raise ValueError("Input img and txt tensors must have 3 dimensions.") # running on sequences img img = self.img_in(img) @@ -254,8 +252,9 @@ class Chroma(nn.Module): img[:, txt.shape[1] :, ...] += add img = img[:, txt.shape[1] :, ...] - final_mod = self.get_modulations(mod_vectors, "final") - img = self.final_layer(img, vec=final_mod) # (N, T, patch_size ** 2 * out_channels) + if hasattr(self, "final_layer"): + final_mod = self.get_modulations(mod_vectors, "final") + img = self.final_layer(img, vec=final_mod) # (N, T, patch_size ** 2 * out_channels) return img def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs): @@ -271,6 +270,9 @@ class Chroma(nn.Module): img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=self.patch_size, pw=self.patch_size) + if img.ndim != 3 or context.ndim != 3: + raise ValueError("Input img and txt tensors must have 3 dimensions.") + h_len = ((h + (self.patch_size // 2)) // self.patch_size) w_len = ((w + (self.patch_size // 2)) // self.patch_size) img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) diff --git a/comfy/ldm/chroma_radiance/layers.py b/comfy/ldm/chroma_radiance/layers.py new file mode 100644 index 000000000..3c7bc9b6b --- /dev/null +++ b/comfy/ldm/chroma_radiance/layers.py @@ -0,0 +1,206 @@ +# Adapted from https://github.com/lodestone-rock/flow +from functools import lru_cache + +import torch +from torch import nn + +from comfy.ldm.flux.layers import RMSNorm + + +class NerfEmbedder(nn.Module): + """ + An embedder module that combines input features with a 2D positional + encoding that mimics the Discrete Cosine Transform (DCT). + + This module takes an input tensor of shape (B, P^2, C), where P is the + patch size, and enriches it with positional information before projecting + it to a new hidden size. + """ + def __init__( + self, + in_channels: int, + hidden_size_input: int, + max_freqs: int, + dtype=None, + device=None, + operations=None, + ): + """ + Initializes the NerfEmbedder. + + Args: + in_channels (int): The number of channels in the input tensor. + hidden_size_input (int): The desired dimension of the output embedding. + max_freqs (int): The number of frequency components to use for both + the x and y dimensions of the positional encoding. + The total number of positional features will be max_freqs^2. + """ + super().__init__() + self.dtype = dtype + self.max_freqs = max_freqs + self.hidden_size_input = hidden_size_input + + # A linear layer to project the concatenated input features and + # positional encodings to the final output dimension. + self.embedder = nn.Sequential( + operations.Linear(in_channels + max_freqs**2, hidden_size_input, dtype=dtype, device=device) + ) + + @lru_cache(maxsize=4) + def fetch_pos(self, patch_size: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor: + """ + Generates and caches 2D DCT-like positional embeddings for a given patch size. + + The LRU cache is a performance optimization that avoids recomputing the + same positional grid on every forward pass. + + Args: + patch_size (int): The side length of the square input patch. + device: The torch device to create the tensors on. + dtype: The torch dtype for the tensors. + + Returns: + A tensor of shape (1, patch_size^2, max_freqs^2) containing the + positional embeddings. + """ + # Create normalized 1D coordinate grids from 0 to 1. + pos_x = torch.linspace(0, 1, patch_size, device=device, dtype=dtype) + pos_y = torch.linspace(0, 1, patch_size, device=device, dtype=dtype) + + # Create a 2D meshgrid of coordinates. + pos_y, pos_x = torch.meshgrid(pos_y, pos_x, indexing="ij") + + # Reshape positions to be broadcastable with frequencies. + # Shape becomes (patch_size^2, 1, 1). + pos_x = pos_x.reshape(-1, 1, 1) + pos_y = pos_y.reshape(-1, 1, 1) + + # Create a 1D tensor of frequency values from 0 to max_freqs-1. + freqs = torch.linspace(0, self.max_freqs - 1, self.max_freqs, dtype=dtype, device=device) + + # Reshape frequencies to be broadcastable for creating 2D basis functions. + # freqs_x shape: (1, max_freqs, 1) + # freqs_y shape: (1, 1, max_freqs) + freqs_x = freqs[None, :, None] + freqs_y = freqs[None, None, :] + + # A custom weighting coefficient, not part of standard DCT. + # This seems to down-weight the contribution of higher-frequency interactions. + coeffs = (1 + freqs_x * freqs_y) ** -1 + + # Calculate the 1D cosine basis functions for x and y coordinates. + # This is the core of the DCT formulation. + dct_x = torch.cos(pos_x * freqs_x * torch.pi) + dct_y = torch.cos(pos_y * freqs_y * torch.pi) + + # Combine the 1D basis functions to create 2D basis functions by element-wise + # multiplication, and apply the custom coefficients. Broadcasting handles the + # combination of all (pos_x, freqs_x) with all (pos_y, freqs_y). + # The result is flattened into a feature vector for each position. + dct = (dct_x * dct_y * coeffs).view(1, -1, self.max_freqs ** 2) + + return dct + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + """ + Forward pass for the embedder. + + Args: + inputs (Tensor): The input tensor of shape (B, P^2, C). + + Returns: + Tensor: The output tensor of shape (B, P^2, hidden_size_input). + """ + # Get the batch size, number of pixels, and number of channels. + B, P2, C = inputs.shape + + # Infer the patch side length from the number of pixels (P^2). + patch_size = int(P2 ** 0.5) + + input_dtype = inputs.dtype + inputs = inputs.to(dtype=self.dtype) + + # Fetch the pre-computed or cached positional embeddings. + dct = self.fetch_pos(patch_size, inputs.device, self.dtype) + + # Repeat the positional embeddings for each item in the batch. + dct = dct.repeat(B, 1, 1) + + # Concatenate the original input features with the positional embeddings + # along the feature dimension. + inputs = torch.cat((inputs, dct), dim=-1) + + # Project the combined tensor to the target hidden size. + return self.embedder(inputs).to(dtype=input_dtype) + + +class NerfGLUBlock(nn.Module): + """ + A NerfBlock using a Gated Linear Unit (GLU) like MLP. + """ + def __init__(self, hidden_size_s: int, hidden_size_x: int, mlp_ratio, dtype=None, device=None, operations=None): + super().__init__() + # The total number of parameters for the MLP is increased to accommodate + # the gate, value, and output projection matrices. + # We now need to generate parameters for 3 matrices. + total_params = 3 * hidden_size_x**2 * mlp_ratio + self.param_generator = operations.Linear(hidden_size_s, total_params, dtype=dtype, device=device) + self.norm = RMSNorm(hidden_size_x, dtype=dtype, device=device, operations=operations) + self.mlp_ratio = mlp_ratio + + + def forward(self, x: torch.Tensor, s: torch.Tensor) -> torch.Tensor: + batch_size, num_x, hidden_size_x = x.shape + mlp_params = self.param_generator(s) + + # Split the generated parameters into three parts for the gate, value, and output projection. + fc1_gate_params, fc1_value_params, fc2_params = mlp_params.chunk(3, dim=-1) + + # Reshape the parameters into matrices for batch matrix multiplication. + fc1_gate = fc1_gate_params.view(batch_size, hidden_size_x, hidden_size_x * self.mlp_ratio) + fc1_value = fc1_value_params.view(batch_size, hidden_size_x, hidden_size_x * self.mlp_ratio) + fc2 = fc2_params.view(batch_size, hidden_size_x * self.mlp_ratio, hidden_size_x) + + # Normalize the generated weight matrices as in the original implementation. + fc1_gate = torch.nn.functional.normalize(fc1_gate, dim=-2) + fc1_value = torch.nn.functional.normalize(fc1_value, dim=-2) + fc2 = torch.nn.functional.normalize(fc2, dim=-2) + + res_x = x + x = self.norm(x) + + # Apply the final output projection. + x = torch.bmm(torch.nn.functional.silu(torch.bmm(x, fc1_gate)) * torch.bmm(x, fc1_value), fc2) + + return x + res_x + + +class NerfFinalLayer(nn.Module): + def __init__(self, hidden_size, out_channels, dtype=None, device=None, operations=None): + super().__init__() + self.norm = RMSNorm(hidden_size, dtype=dtype, device=device, operations=operations) + self.linear = operations.Linear(hidden_size, out_channels, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # RMSNorm normalizes over the last dimension, but our channel dim (C) is at dim=1. + # So we temporarily move the channel dimension to the end for the norm operation. + return self.linear(self.norm(x.movedim(1, -1))).movedim(-1, 1) + + +class NerfFinalLayerConv(nn.Module): + def __init__(self, hidden_size: int, out_channels: int, dtype=None, device=None, operations=None): + super().__init__() + self.norm = RMSNorm(hidden_size, dtype=dtype, device=device, operations=operations) + self.conv = operations.Conv2d( + in_channels=hidden_size, + out_channels=out_channels, + kernel_size=3, + padding=1, + dtype=dtype, + device=device, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # RMSNorm normalizes over the last dimension, but our channel dim (C) is at dim=1. + # So we temporarily move the channel dimension to the end for the norm operation. + return self.conv(self.norm(x.movedim(1, -1)).movedim(-1, 1)) diff --git a/comfy/ldm/chroma_radiance/model.py b/comfy/ldm/chroma_radiance/model.py new file mode 100644 index 000000000..f7eb7a22e --- /dev/null +++ b/comfy/ldm/chroma_radiance/model.py @@ -0,0 +1,328 @@ +# Credits: +# Original Flux code can be found on: https://github.com/black-forest-labs/flux +# Chroma Radiance adaption referenced from https://github.com/lodestone-rock/flow + +from dataclasses import dataclass +from typing import Optional + +import torch +from torch import Tensor, nn +from einops import repeat +import comfy.ldm.common_dit + +from comfy.ldm.flux.layers import EmbedND + +from comfy.ldm.chroma.model import Chroma, ChromaParams +from comfy.ldm.chroma.layers import ( + DoubleStreamBlock, + SingleStreamBlock, + Approximator, +) +from .layers import ( + NerfEmbedder, + NerfGLUBlock, + NerfFinalLayer, + NerfFinalLayerConv, +) + + +@dataclass +class ChromaRadianceParams(ChromaParams): + patch_size: int + nerf_hidden_size: int + nerf_mlp_ratio: int + nerf_depth: int + nerf_max_freqs: int + # Setting nerf_tile_size to 0 disables tiling. + nerf_tile_size: int + # Currently one of linear (legacy) or conv. + nerf_final_head_type: str + # None means use the same dtype as the model. + nerf_embedder_dtype: Optional[torch.dtype] + + +class ChromaRadiance(Chroma): + """ + Transformer model for flow matching on sequences. + """ + + def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs): + if operations is None: + raise RuntimeError("Attempt to create ChromaRadiance object without setting operations") + nn.Module.__init__(self) + self.dtype = dtype + params = ChromaRadianceParams(**kwargs) + self.params = params + self.patch_size = params.patch_size + self.in_channels = params.in_channels + self.out_channels = params.out_channels + if params.hidden_size % params.num_heads != 0: + raise ValueError( + f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}" + ) + pe_dim = params.hidden_size // params.num_heads + if sum(params.axes_dim) != pe_dim: + raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}") + self.hidden_size = params.hidden_size + self.num_heads = params.num_heads + self.in_dim = params.in_dim + self.out_dim = params.out_dim + self.hidden_dim = params.hidden_dim + self.n_layers = params.n_layers + self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) + self.img_in_patch = operations.Conv2d( + params.in_channels, + params.hidden_size, + kernel_size=params.patch_size, + stride=params.patch_size, + bias=True, + dtype=dtype, + device=device, + ) + self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device) + # set as nn identity for now, will overwrite it later. + self.distilled_guidance_layer = Approximator( + in_dim=self.in_dim, + hidden_dim=self.hidden_dim, + out_dim=self.out_dim, + n_layers=self.n_layers, + dtype=dtype, device=device, operations=operations + ) + + + self.double_blocks = nn.ModuleList( + [ + DoubleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + qkv_bias=params.qkv_bias, + dtype=dtype, device=device, operations=operations + ) + for _ in range(params.depth) + ] + ) + + self.single_blocks = nn.ModuleList( + [ + SingleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + dtype=dtype, device=device, operations=operations, + ) + for _ in range(params.depth_single_blocks) + ] + ) + + # pixel channel concat with DCT + self.nerf_image_embedder = NerfEmbedder( + in_channels=params.in_channels, + hidden_size_input=params.nerf_hidden_size, + max_freqs=params.nerf_max_freqs, + dtype=params.nerf_embedder_dtype or dtype, + device=device, + operations=operations, + ) + + self.nerf_blocks = nn.ModuleList([ + NerfGLUBlock( + hidden_size_s=params.hidden_size, + hidden_size_x=params.nerf_hidden_size, + mlp_ratio=params.nerf_mlp_ratio, + dtype=dtype, + device=device, + operations=operations, + ) for _ in range(params.nerf_depth) + ]) + + if params.nerf_final_head_type == "linear": + self.nerf_final_layer = NerfFinalLayer( + params.nerf_hidden_size, + out_channels=params.in_channels, + dtype=dtype, + device=device, + operations=operations, + ) + elif params.nerf_final_head_type == "conv": + self.nerf_final_layer_conv = NerfFinalLayerConv( + params.nerf_hidden_size, + out_channels=params.in_channels, + dtype=dtype, + device=device, + operations=operations, + ) + else: + errstr = f"Unsupported nerf_final_head_type {params.nerf_final_head_type}" + raise ValueError(errstr) + + self.skip_mmdit = [] + self.skip_dit = [] + self.lite = False + + @property + def _nerf_final_layer(self) -> nn.Module: + if self.params.nerf_final_head_type == "linear": + return self.nerf_final_layer + if self.params.nerf_final_head_type == "conv": + return self.nerf_final_layer_conv + # Impossible to get here as we raise an error on unexpected types on initialization. + raise NotImplementedError + + def img_in(self, img: Tensor) -> Tensor: + img = self.img_in_patch(img) # -> [B, Hidden, H/P, W/P] + # flatten into a sequence for the transformer. + return img.flatten(2).transpose(1, 2) # -> [B, NumPatches, Hidden] + + def forward_nerf( + self, + img_orig: Tensor, + img_out: Tensor, + params: ChromaRadianceParams, + ) -> Tensor: + B, C, H, W = img_orig.shape + num_patches = img_out.shape[1] + patch_size = params.patch_size + + # Store the raw pixel values of each patch for the NeRF head later. + # unfold creates patches: [B, C * P * P, NumPatches] + nerf_pixels = nn.functional.unfold(img_orig, kernel_size=patch_size, stride=patch_size) + nerf_pixels = nerf_pixels.transpose(1, 2) # -> [B, NumPatches, C * P * P] + + if params.nerf_tile_size > 0 and num_patches > params.nerf_tile_size: + # Enable tiling if nerf_tile_size isn't 0 and we actually have more patches than + # the tile size. + img_dct = self.forward_tiled_nerf(img_out, nerf_pixels, B, C, num_patches, patch_size, params) + else: + # Reshape for per-patch processing + nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size) + nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2) + + # Get DCT-encoded pixel embeddings [pixel-dct] + img_dct = self.nerf_image_embedder(nerf_pixels) + + # Pass through the dynamic MLP blocks (the NeRF) + for block in self.nerf_blocks: + img_dct = block(img_dct, nerf_hidden) + + # Reassemble the patches into the final image. + img_dct = img_dct.transpose(1, 2) # -> [B*NumPatches, C, P*P] + # Reshape to combine with batch dimension for fold + img_dct = img_dct.reshape(B, num_patches, -1) # -> [B, NumPatches, C*P*P] + img_dct = img_dct.transpose(1, 2) # -> [B, C*P*P, NumPatches] + img_dct = nn.functional.fold( + img_dct, + output_size=(H, W), + kernel_size=patch_size, + stride=patch_size, + ) + return self._nerf_final_layer(img_dct) + + def forward_tiled_nerf( + self, + nerf_hidden: Tensor, + nerf_pixels: Tensor, + batch: int, + channels: int, + num_patches: int, + patch_size: int, + params: ChromaRadianceParams, + ) -> Tensor: + """ + Processes the NeRF head in tiles to save memory. + nerf_hidden has shape [B, L, D] + nerf_pixels has shape [B, L, C * P * P] + """ + tile_size = params.nerf_tile_size + output_tiles = [] + # Iterate over the patches in tiles. The dimension L (num_patches) is at index 1. + for i in range(0, num_patches, tile_size): + end = min(i + tile_size, num_patches) + + # Slice the current tile from the input tensors + nerf_hidden_tile = nerf_hidden[:, i:end, :] + nerf_pixels_tile = nerf_pixels[:, i:end, :] + + # Get the actual number of patches in this tile (can be smaller for the last tile) + num_patches_tile = nerf_hidden_tile.shape[1] + + # Reshape the tile for per-patch processing + # [B, NumPatches_tile, D] -> [B * NumPatches_tile, D] + nerf_hidden_tile = nerf_hidden_tile.reshape(batch * num_patches_tile, params.hidden_size) + # [B, NumPatches_tile, C*P*P] -> [B*NumPatches_tile, C, P*P] -> [B*NumPatches_tile, P*P, C] + nerf_pixels_tile = nerf_pixels_tile.reshape(batch * num_patches_tile, channels, patch_size**2).transpose(1, 2) + + # get DCT-encoded pixel embeddings [pixel-dct] + img_dct_tile = self.nerf_image_embedder(nerf_pixels_tile) + + # pass through the dynamic MLP blocks (the NeRF) + for block in self.nerf_blocks: + img_dct_tile = block(img_dct_tile, nerf_hidden_tile) + + output_tiles.append(img_dct_tile) + + # Concatenate the processed tiles along the patch dimension + return torch.cat(output_tiles, dim=0) + + def radiance_get_override_params(self, overrides: dict) -> ChromaRadianceParams: + params = self.params + if not overrides: + return params + params_dict = {k: getattr(params, k) for k in params.__dataclass_fields__} + nullable_keys = frozenset(("nerf_embedder_dtype",)) + bad_keys = tuple(k for k in overrides if k not in params_dict) + if bad_keys: + e = f"Unknown key(s) in transformer_options chroma_radiance_options: {', '.join(bad_keys)}" + raise ValueError(e) + bad_keys = tuple( + k + for k, v in overrides.items() + if type(v) != type(getattr(params, k)) and (v is not None or k not in nullable_keys) + ) + if bad_keys: + e = f"Invalid value(s) in transformer_options chroma_radiance_options: {', '.join(bad_keys)}" + raise ValueError(e) + # At this point it's all valid keys and values so we can merge with the existing params. + params_dict |= overrides + return params.__class__(**params_dict) + + def _forward( + self, + x: Tensor, + timestep: Tensor, + context: Tensor, + guidance: Optional[Tensor], + control: Optional[dict]=None, + transformer_options: dict={}, + **kwargs: dict, + ) -> Tensor: + bs, c, h, w = x.shape + img = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size)) + + if img.ndim != 4: + raise ValueError("Input img tensor must be in [B, C, H, W] format.") + if context.ndim != 3: + raise ValueError("Input txt tensors must have 3 dimensions.") + + params = self.radiance_get_override_params(transformer_options.get("chroma_radiance_options", {})) + + h_len = ((h + (self.patch_size // 2)) // self.patch_size) + w_len = ((w + (self.patch_size // 2)) // self.patch_size) + img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs) + txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) + + img_out = self.forward_orig( + img, + img_ids, + context, + txt_ids, + timestep, + guidance, + control, + transformer_options, + attn_mask=kwargs.get("attention_mask", None), + ) + return self.forward_nerf(img, img_out, params) diff --git a/comfy/model_base.py b/comfy/model_base.py index 324d89cff..252dfcf69 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -42,6 +42,7 @@ import comfy.ldm.wan.model import comfy.ldm.hunyuan3d.model import comfy.ldm.hidream.model import comfy.ldm.chroma.model +import comfy.ldm.chroma_radiance.model import comfy.ldm.ace.model import comfy.ldm.omnigen.omnigen2 import comfy.ldm.qwen_image.model @@ -1320,8 +1321,8 @@ class HiDream(BaseModel): return out class Chroma(Flux): - def __init__(self, model_config, model_type=ModelType.FLUX, device=None): - super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.chroma.model.Chroma) + def __init__(self, model_config, model_type=ModelType.FLUX, device=None, unet_model=comfy.ldm.chroma.model.Chroma): + super().__init__(model_config, model_type, device=device, unet_model=unet_model) def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) @@ -1331,6 +1332,10 @@ class Chroma(Flux): out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance])) return out +class ChromaRadiance(Chroma): + def __init__(self, model_config, model_type=ModelType.FLUX, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.chroma_radiance.model.ChromaRadiance) + class ACEStep(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.ace.model.ACEStepTransformer2DModel) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index fe983cede..03d44f65e 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -174,7 +174,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["guidance_embed"] = len(guidance_keys) > 0 return dit_config - if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and '{}img_in.weight'.format(key_prefix) in state_dict_keys: #Flux + if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): #Flux, Chroma or Chroma Radiance (has no img_in.weight) dit_config = {} dit_config["image_model"] = "flux" dit_config["in_channels"] = 16 @@ -204,6 +204,18 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["out_dim"] = 3072 dit_config["hidden_dim"] = 5120 dit_config["n_layers"] = 5 + if f"{key_prefix}nerf_blocks.0.norm.scale" in state_dict_keys: #Chroma Radiance + dit_config["image_model"] = "chroma_radiance" + dit_config["in_channels"] = 3 + dit_config["out_channels"] = 3 + dit_config["patch_size"] = 16 + dit_config["nerf_hidden_size"] = 64 + dit_config["nerf_mlp_ratio"] = 4 + dit_config["nerf_depth"] = 4 + dit_config["nerf_max_freqs"] = 8 + dit_config["nerf_tile_size"] = 32 + dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear" + dit_config["nerf_embedder_dtype"] = torch.float32 else: dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys return dit_config diff --git a/comfy/sd.py b/comfy/sd.py index f8f1a89e8..cb92802e9 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -785,6 +785,66 @@ class VAE: except: return None +# "Fake" VAE that converts from IMAGE B, H, W, C and values on the scale of 0..1 +# to LATENT B, C, H, W and values on the scale of -1..1. +class PixelspaceConversionVAE: + def __init__(self, size_increment: int=16): + self.intermediate_device = comfy.model_management.intermediate_device() + self.size_increment = size_increment + + def vae_encode_crop_pixels(self, pixels: torch.Tensor) -> torch.Tensor: + if self.size_increment == 1: + return pixels + dims = pixels.shape[1:-1] + for d in range(len(dims)): + d_adj = (dims[d] // self.size_increment) * self.size_increment + if d_adj == d: + continue + d_offset = (dims[d] % self.size_increment) // 2 + pixels = pixels.narrow(d + 1, d_offset, d_adj) + return pixels + + def encode(self, pixels: torch.Tensor, *_args, **_kwargs) -> torch.Tensor: + if pixels.ndim == 3: + pixels = pixels.unsqueeze(0) + elif pixels.ndim != 4: + raise ValueError("Unexpected input image shape") + # Ensure the image has spatial dimensions that are multiples of 16. + pixels = self.vae_encode_crop_pixels(pixels) + h, w, c = pixels.shape[1:] + if h < self.size_increment or w < self.size_increment: + raise ValueError(f"Image inputs must have height/width of at least {self.size_increment} pixel(s).") + pixels= pixels[..., :3] + if c == 1: + pixels = pixels.expand(-1, -1, -1, 3) + elif c != 3: + raise ValueError("Unexpected number of channels in input image") + # Rescale to -1..1 and move the channel dimension to position 1. + latent = pixels.to(device=self.intermediate_device, dtype=torch.float32, copy=True) + latent = latent.clamp_(0, 1).movedim(-1, 1).contiguous() + latent -= 0.5 + latent *= 2 + return latent.clamp_(-1, 1) + + def decode(self, samples: torch.Tensor, *_args, **_kwargs) -> torch.Tensor: + # Rescale to 0..1 and move the channel dimension to the end. + img = samples.to(device=self.intermediate_device, dtype=torch.float32, copy=True) + img = img.clamp_(-1, 1).movedim(1, -1).contiguous() + img += 1.0 + img *= 0.5 + return img.clamp_(0, 1) + + encode_tiled = encode + decode_tiled = decode + + @classmethod + def spacial_compression_decode(cls) -> int: + # This just exists so the tiled VAE nodes don't crash. + return 1 + + spacial_compression_encode = spacial_compression_decode + temporal_compression_decode = spacial_compression_decode + class StyleModel: def __init__(self, model, device="cpu"): self.model = model diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 472ea0ae9..be36b5dfe 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1205,6 +1205,19 @@ class Chroma(supported_models_base.BASE): t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.pixart_t5.PixArtTokenizer, comfy.text_encoders.pixart_t5.pixart_te(**t5_detect)) +class ChromaRadiance(Chroma): + unet_config = { + "image_model": "chroma_radiance", + } + + latent_format = comfy.latent_formats.ChromaRadiance + + # Pixel-space model, no spatial compression for model input. + memory_usage_factor = 0.0325 + + def get_model(self, state_dict, prefix="", device=None): + return model_base.ChromaRadiance(self, device=device) + class ACEStep(supported_models_base.BASE): unet_config = { "audio_model": "ace", @@ -1338,6 +1351,6 @@ class HunyuanImage21Refiner(HunyuanVideo): out = model_base.HunyuanImage21Refiner(self, device=device) return out -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_chroma_radiance.py b/comfy_extras/nodes_chroma_radiance.py new file mode 100644 index 000000000..381989818 --- /dev/null +++ b/comfy_extras/nodes_chroma_radiance.py @@ -0,0 +1,114 @@ +from typing_extensions import override +from typing import Callable + +import torch + +import comfy.model_management +from comfy_api.latest import ComfyExtension, io + +import nodes + +class EmptyChromaRadianceLatentImage(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="EmptyChromaRadianceLatentImage", + category="latent/chroma_radiance", + inputs=[ + io.Int.Input(id="width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input(id="height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input(id="batch_size", default=1, min=1, max=4096), + ], + outputs=[io.Latent().Output()], + ) + + @classmethod + def execute(cls, *, width: int, height: int, batch_size: int=1) -> io.NodeOutput: + latent = torch.zeros((batch_size, 3, height, width), device=comfy.model_management.intermediate_device()) + return io.NodeOutput({"samples":latent}) + + +class ChromaRadianceOptions(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="ChromaRadianceOptions", + category="model_patches/chroma_radiance", + description="Allows setting advanced options for the Chroma Radiance model.", + inputs=[ + io.Model.Input(id="model"), + io.Boolean.Input( + id="preserve_wrapper", + default=True, + tooltip="When enabled, will delegate to an existing model function wrapper if it exists. Generally should be left enabled.", + ), + io.Float.Input( + id="start_sigma", + default=1.0, + min=0.0, + max=1.0, + tooltip="First sigma that these options will be in effect.", + ), + io.Float.Input( + id="end_sigma", + default=0.0, + min=0.0, + max=1.0, + tooltip="Last sigma that these options will be in effect.", + ), + io.Int.Input( + id="nerf_tile_size", + default=-1, + min=-1, + tooltip="Allows overriding the default NeRF tile size. -1 means use the default (32). 0 means use non-tiling mode (may require a lot of VRAM).", + ), + ], + outputs=[io.Model.Output()], + ) + + @classmethod + def execute( + cls, + *, + model: io.Model.Type, + preserve_wrapper: bool, + start_sigma: float, + end_sigma: float, + nerf_tile_size: int, + ) -> io.NodeOutput: + radiance_options = {} + if nerf_tile_size >= 0: + radiance_options["nerf_tile_size"] = nerf_tile_size + + if not radiance_options: + return io.NodeOutput(model) + + old_wrapper = model.model_options.get("model_function_wrapper") + + def model_function_wrapper(apply_model: Callable, args: dict) -> torch.Tensor: + c = args["c"].copy() + sigma = args["timestep"].max().detach().cpu().item() + if end_sigma <= sigma <= start_sigma: + transformer_options = c.get("transformer_options", {}).copy() + transformer_options["chroma_radiance_options"] = radiance_options.copy() + c["transformer_options"] = transformer_options + if not (preserve_wrapper and old_wrapper): + return apply_model(args["input"], args["timestep"], **c) + return old_wrapper(apply_model, args | {"c": c}) + + model = model.clone() + model.set_model_unet_function_wrapper(model_function_wrapper) + return io.NodeOutput(model) + + +class ChromaRadianceExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + EmptyChromaRadianceLatentImage, + ChromaRadianceOptions, + ] + + +async def comfy_entrypoint() -> ChromaRadianceExtension: + return ChromaRadianceExtension() diff --git a/nodes.py b/nodes.py index 2befb4b75..76b8cbac8 100644 --- a/nodes.py +++ b/nodes.py @@ -730,6 +730,7 @@ class VAELoader: vaes.append("taesd3") if f1_taesd_dec and f1_taesd_enc: vaes.append("taef1") + vaes.append("chroma_radiance") return vaes @staticmethod @@ -772,7 +773,9 @@ class VAELoader: #TODO: scale factor? def load_vae(self, vae_name): - if vae_name in ["taesd", "taesdxl", "taesd3", "taef1"]: + if vae_name == "chroma_radiance": + return (comfy.sd.PixelspaceConversionVAE(),) + elif vae_name in ["taesd", "taesdxl", "taesd3", "taef1"]: sd = self.load_taesd(vae_name) else: vae_path = folder_paths.get_full_path_or_raise("vae", vae_name) @@ -2322,6 +2325,7 @@ async def init_builtin_extra_nodes(): "nodes_tcfg.py", "nodes_context_windows.py", "nodes_qwen.py", + "nodes_chroma_radiance.py", "nodes_model_patch.py", "nodes_easycache.py", "nodes_audio_encoder.py", From 80b7c9455bf7afba7a9e95a1eb76b172408ab56c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 13 Sep 2025 15:03:34 -0700 Subject: [PATCH 287/759] Changes to the previous radiance commit. (#9851) --- comfy/ldm/chroma_radiance/model.py | 7 +-- comfy/pixel_space_convert.py | 16 +++++++ comfy/sd.py | 69 +++++------------------------- comfy/supported_models.py | 2 +- nodes.py | 7 +-- 5 files changed, 35 insertions(+), 66 deletions(-) create mode 100644 comfy/pixel_space_convert.py diff --git a/comfy/ldm/chroma_radiance/model.py b/comfy/ldm/chroma_radiance/model.py index f7eb7a22e..47aa11b04 100644 --- a/comfy/ldm/chroma_radiance/model.py +++ b/comfy/ldm/chroma_radiance/model.py @@ -306,8 +306,9 @@ class ChromaRadiance(Chroma): params = self.radiance_get_override_params(transformer_options.get("chroma_radiance_options", {})) - h_len = ((h + (self.patch_size // 2)) // self.patch_size) - w_len = ((w + (self.patch_size // 2)) // self.patch_size) + h_len = (img.shape[-2] // self.patch_size) + w_len = (img.shape[-1] // self.patch_size) + img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) @@ -325,4 +326,4 @@ class ChromaRadiance(Chroma): transformer_options, attn_mask=kwargs.get("attention_mask", None), ) - return self.forward_nerf(img, img_out, params) + return self.forward_nerf(img, img_out, params)[:, :, :h, :w] diff --git a/comfy/pixel_space_convert.py b/comfy/pixel_space_convert.py new file mode 100644 index 000000000..049bbcfb4 --- /dev/null +++ b/comfy/pixel_space_convert.py @@ -0,0 +1,16 @@ +import torch + + +# "Fake" VAE that converts from IMAGE B, H, W, C and values on the scale of 0..1 +# to LATENT B, C, H, W and values on the scale of -1..1. +class PixelspaceConversionVAE(torch.nn.Module): + def __init__(self): + super().__init__() + self.pixel_space_vae = torch.nn.Parameter(torch.tensor(1.0)) + + def encode(self, pixels: torch.Tensor, *_args, **_kwargs) -> torch.Tensor: + return pixels + + def decode(self, samples: torch.Tensor, *_args, **_kwargs) -> torch.Tensor: + return samples + diff --git a/comfy/sd.py b/comfy/sd.py index cb92802e9..2df340739 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -18,6 +18,7 @@ import comfy.ldm.wan.vae2_2 import comfy.ldm.hunyuan3d.vae import comfy.ldm.ace.vae.music_dcae_pipeline import comfy.ldm.hunyuan_video.vae +import comfy.pixel_space_convert import yaml import math import os @@ -516,6 +517,15 @@ class VAE: self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] self.disable_offload = True self.extra_1d_channel = 16 + elif "pixel_space_vae" in sd: + self.first_stage_model = comfy.pixel_space_convert.PixelspaceConversionVAE() + self.memory_used_encode = lambda shape, dtype: (1 * shape[2] * shape[3]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (1 * shape[2] * shape[3]) * model_management.dtype_size(dtype) + self.downscale_ratio = 1 + self.upscale_ratio = 1 + self.latent_channels = 3 + self.latent_dim = 2 + self.output_channels = 3 else: logging.warning("WARNING: No VAE weights detected, VAE not initalized.") self.first_stage_model = None @@ -785,65 +795,6 @@ class VAE: except: return None -# "Fake" VAE that converts from IMAGE B, H, W, C and values on the scale of 0..1 -# to LATENT B, C, H, W and values on the scale of -1..1. -class PixelspaceConversionVAE: - def __init__(self, size_increment: int=16): - self.intermediate_device = comfy.model_management.intermediate_device() - self.size_increment = size_increment - - def vae_encode_crop_pixels(self, pixels: torch.Tensor) -> torch.Tensor: - if self.size_increment == 1: - return pixels - dims = pixels.shape[1:-1] - for d in range(len(dims)): - d_adj = (dims[d] // self.size_increment) * self.size_increment - if d_adj == d: - continue - d_offset = (dims[d] % self.size_increment) // 2 - pixels = pixels.narrow(d + 1, d_offset, d_adj) - return pixels - - def encode(self, pixels: torch.Tensor, *_args, **_kwargs) -> torch.Tensor: - if pixels.ndim == 3: - pixels = pixels.unsqueeze(0) - elif pixels.ndim != 4: - raise ValueError("Unexpected input image shape") - # Ensure the image has spatial dimensions that are multiples of 16. - pixels = self.vae_encode_crop_pixels(pixels) - h, w, c = pixels.shape[1:] - if h < self.size_increment or w < self.size_increment: - raise ValueError(f"Image inputs must have height/width of at least {self.size_increment} pixel(s).") - pixels= pixels[..., :3] - if c == 1: - pixels = pixels.expand(-1, -1, -1, 3) - elif c != 3: - raise ValueError("Unexpected number of channels in input image") - # Rescale to -1..1 and move the channel dimension to position 1. - latent = pixels.to(device=self.intermediate_device, dtype=torch.float32, copy=True) - latent = latent.clamp_(0, 1).movedim(-1, 1).contiguous() - latent -= 0.5 - latent *= 2 - return latent.clamp_(-1, 1) - - def decode(self, samples: torch.Tensor, *_args, **_kwargs) -> torch.Tensor: - # Rescale to 0..1 and move the channel dimension to the end. - img = samples.to(device=self.intermediate_device, dtype=torch.float32, copy=True) - img = img.clamp_(-1, 1).movedim(1, -1).contiguous() - img += 1.0 - img *= 0.5 - return img.clamp_(0, 1) - - encode_tiled = encode - decode_tiled = decode - - @classmethod - def spacial_compression_decode(cls) -> int: - # This just exists so the tiled VAE nodes don't crash. - return 1 - - spacial_compression_encode = spacial_compression_decode - temporal_compression_decode = spacial_compression_decode class StyleModel: def __init__(self, model, device="cpu"): diff --git a/comfy/supported_models.py b/comfy/supported_models.py index be36b5dfe..557902d11 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1213,7 +1213,7 @@ class ChromaRadiance(Chroma): latent_format = comfy.latent_formats.ChromaRadiance # Pixel-space model, no spatial compression for model input. - memory_usage_factor = 0.0325 + memory_usage_factor = 0.038 def get_model(self, state_dict, prefix="", device=None): return model_base.ChromaRadiance(self, device=device) diff --git a/nodes.py b/nodes.py index 76b8cbac8..5a5fdcb8e 100644 --- a/nodes.py +++ b/nodes.py @@ -730,7 +730,7 @@ class VAELoader: vaes.append("taesd3") if f1_taesd_dec and f1_taesd_enc: vaes.append("taef1") - vaes.append("chroma_radiance") + vaes.append("pixel_space") return vaes @staticmethod @@ -773,8 +773,9 @@ class VAELoader: #TODO: scale factor? def load_vae(self, vae_name): - if vae_name == "chroma_radiance": - return (comfy.sd.PixelspaceConversionVAE(),) + if vae_name == "pixel_space": + sd = {} + sd["pixel_space_vae"] = torch.tensor(1.0) elif vae_name in ["taesd", "taesdxl", "taesd3", "taef1"]: sd = self.load_taesd(vae_name) else: From f228367c5e3906de194968fa9b6fbe7aa9987bfa Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Sat, 13 Sep 2025 18:34:21 -0700 Subject: [PATCH 288/759] Make ModuleNotFoundError ImportError instead (#9850) --- comfy/ldm/modules/attention.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index bf2553c37..9dd1a43c1 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -22,7 +22,7 @@ SAGE_ATTENTION_IS_AVAILABLE = False try: from sageattention import sageattn SAGE_ATTENTION_IS_AVAILABLE = True -except ModuleNotFoundError as e: +except ImportError as e: if model_management.sage_attention_enabled(): if e.name == "sageattention": logging.error(f"\n\nTo use the `--use-sage-attention` feature, the `sageattention` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install sageattention") @@ -34,7 +34,7 @@ FLASH_ATTENTION_IS_AVAILABLE = False try: from flash_attn import flash_attn_func FLASH_ATTENTION_IS_AVAILABLE = True -except ModuleNotFoundError: +except ImportError: if model_management.flash_attention_enabled(): logging.error(f"\n\nTo use the `--use-flash-attention` feature, the `flash-attn` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install flash-attn") exit(-1) From 4f1f26ac6c11b803bbc83cb347178e2f9b5e421b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 14 Sep 2025 01:05:38 -0700 Subject: [PATCH 289/759] Add that hunyuan image is supported to readme. (#9857) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8024870c2..3f6cfc2ed 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/) - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/) - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/) + - [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/) - Image Editing Models - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model) From 47a9cde5d3045c42f20baafb9855fb96959124f0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 15 Sep 2025 15:10:55 -0700 Subject: [PATCH 290/759] Support the omnigen2 umo lora. (#9886) --- comfy/lora.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/comfy/lora.py b/comfy/lora.py index 4a44f1318..36d26293a 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -297,6 +297,12 @@ def model_lora_keys_unet(model, key_map={}): key_lora = k[len("diffusion_model."):-len(".weight")] key_map["{}".format(key_lora)] = k + if isinstance(model, comfy.model_base.Omnigen2): + for k in sdk: + if k.startswith("diffusion_model.") and k.endswith(".weight"): + key_lora = k[len("diffusion_model."):-len(".weight")] + key_map["{}".format(key_lora)] = k + if isinstance(model, comfy.model_base.QwenImage): for k in sdk: if k.startswith("diffusion_model.") and k.endswith(".weight"): #QwenImage lora format From 1a85483da159f2800407ae5a8a45eb0d88ffce2d Mon Sep 17 00:00:00 2001 From: blepping <157360029+blepping@users.noreply.github.com> Date: Mon, 15 Sep 2025 18:05:03 -0600 Subject: [PATCH 291/759] Fix depending on asserts to raise an exception in BatchedBrownianTree and Flash attn module (#9884) Correctly handle the case where w0 is passed by kwargs in BatchedBrownianTree --- comfy/k_diffusion/sampling.py | 35 +++++++++++++++++----------------- comfy/ldm/modules/attention.py | 3 ++- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index 2d7e09838..0e2cda291 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -86,24 +86,24 @@ class BatchedBrownianTree: """A wrapper around torchsde.BrownianTree that enables batches of entropy.""" def __init__(self, x, t0, t1, seed=None, **kwargs): - self.cpu_tree = True - if "cpu" in kwargs: - self.cpu_tree = kwargs.pop("cpu") + self.cpu_tree = kwargs.pop("cpu", True) t0, t1, self.sign = self.sort(t0, t1) - w0 = kwargs.get('w0', torch.zeros_like(x)) + w0 = kwargs.pop('w0', None) + if w0 is None: + w0 = torch.zeros_like(x) + self.batched = False if seed is None: - seed = torch.randint(0, 2 ** 63 - 1, []).item() - self.batched = True - try: - assert len(seed) == x.shape[0] + seed = (torch.randint(0, 2 ** 63 - 1, ()).item(),) + elif isinstance(seed, (tuple, list)): + if len(seed) != x.shape[0]: + raise ValueError("Passing a list or tuple of seeds to BatchedBrownianTree requires a length matching the batch size.") + self.batched = True w0 = w0[0] - except TypeError: - seed = [seed] - self.batched = False - if self.cpu_tree: - self.trees = [torchsde.BrownianTree(t0.cpu(), w0.cpu(), t1.cpu(), entropy=s, **kwargs) for s in seed] else: - self.trees = [torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed] + seed = (seed,) + if self.cpu_tree: + t0, w0, t1 = t0.detach().cpu(), w0.detach().cpu(), t1.detach().cpu() + self.trees = tuple(torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed) @staticmethod def sort(a, b): @@ -111,11 +111,10 @@ class BatchedBrownianTree: def __call__(self, t0, t1): t0, t1, sign = self.sort(t0, t1) + device, dtype = t0.device, t0.dtype if self.cpu_tree: - w = torch.stack([tree(t0.cpu().float(), t1.cpu().float()).to(t0.dtype).to(t0.device) for tree in self.trees]) * (self.sign * sign) - else: - w = torch.stack([tree(t0, t1) for tree in self.trees]) * (self.sign * sign) - + t0, t1 = t0.detach().cpu().float(), t1.detach().cpu().float() + w = torch.stack([tree(t0, t1) for tree in self.trees]).to(device=device, dtype=dtype) * (self.sign * sign) return w if self.batched else w[0] diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index 9dd1a43c1..7437e0567 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -600,7 +600,8 @@ def attention_flash(q, k, v, heads, mask=None, attn_precision=None, skip_reshape mask = mask.unsqueeze(1) try: - assert mask is None + if mask is not None: + raise RuntimeError("Mask must not be set for Flash attention") out = flash_attn_wrapper( q.transpose(1, 2), k.transpose(1, 2), From a39ac59c3e3fddc8b278899814f0bd5371abb11f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 15 Sep 2025 22:19:50 -0700 Subject: [PATCH 292/759] Add encoder part of whisper large v3 as an audio encoder model. (#9894) Not useful yet but some models use it. --- comfy/audio_encoders/audio_encoders.py | 58 +++++--- comfy/audio_encoders/whisper.py | 186 +++++++++++++++++++++++++ 2 files changed, 224 insertions(+), 20 deletions(-) create mode 100755 comfy/audio_encoders/whisper.py diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py index 6fb5b08e9..0550b2f9b 100644 --- a/comfy/audio_encoders/audio_encoders.py +++ b/comfy/audio_encoders/audio_encoders.py @@ -1,4 +1,5 @@ from .wav2vec2 import Wav2Vec2Model +from .whisper import WhisperLargeV3 import comfy.model_management import comfy.ops import comfy.utils @@ -11,13 +12,18 @@ class AudioEncoderModel(): self.load_device = comfy.model_management.text_encoder_device() offload_device = comfy.model_management.text_encoder_offload_device() self.dtype = comfy.model_management.text_encoder_dtype(self.load_device) + model_type = config.pop("model_type") model_config = dict(config) model_config.update({ "dtype": self.dtype, "device": offload_device, "operations": comfy.ops.manual_cast }) - self.model = Wav2Vec2Model(**model_config) + + if model_type == "wav2vec2": + self.model = Wav2Vec2Model(**model_config) + elif model_type == "whisper3": + self.model = WhisperLargeV3(**model_config) self.model.eval() self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device) self.model_sample_rate = 16000 @@ -40,33 +46,45 @@ class AudioEncoderModel(): def load_audio_encoder_from_sd(sd, prefix=""): sd = comfy.utils.state_dict_prefix_replace(sd, {"wav2vec2.": ""}) - embed_dim = sd["encoder.layer_norm.bias"].shape[0] - if embed_dim == 1024:# large - config = { - "embed_dim": 1024, - "num_heads": 16, - "num_layers": 24, - "conv_norm": True, - "conv_bias": True, - "do_normalize": True, - "do_stable_layer_norm": True + if "encoder.layer_norm.bias" in sd: #wav2vec2 + embed_dim = sd["encoder.layer_norm.bias"].shape[0] + if embed_dim == 1024:# large + config = { + "model_type": "wav2vec2", + "embed_dim": 1024, + "num_heads": 16, + "num_layers": 24, + "conv_norm": True, + "conv_bias": True, + "do_normalize": True, + "do_stable_layer_norm": True + } + elif embed_dim == 768: # base + config = { + "model_type": "wav2vec2", + "embed_dim": 768, + "num_heads": 12, + "num_layers": 12, + "conv_norm": False, + "conv_bias": False, + "do_normalize": False, # chinese-wav2vec2-base has this False + "do_stable_layer_norm": False } - elif embed_dim == 768: # base + else: + raise RuntimeError("ERROR: audio encoder file is invalid or unsupported embed_dim: {}".format(embed_dim)) + elif "model.encoder.embed_positions.weight" in sd: + sd = comfy.utils.state_dict_prefix_replace(sd, {"model.": ""}) config = { - "embed_dim": 768, - "num_heads": 12, - "num_layers": 12, - "conv_norm": False, - "conv_bias": False, - "do_normalize": False, # chinese-wav2vec2-base has this False - "do_stable_layer_norm": False + "model_type": "whisper3", } else: - raise RuntimeError("ERROR: audio encoder file is invalid or unsupported embed_dim: {}".format(embed_dim)) + raise RuntimeError("ERROR: audio encoder not supported.") audio_encoder = AudioEncoderModel(config) m, u = audio_encoder.load_sd(sd) if len(m) > 0: logging.warning("missing audio encoder: {}".format(m)) + if len(u) > 0: + logging.warning("unexpected audio encoder: {}".format(u)) return audio_encoder diff --git a/comfy/audio_encoders/whisper.py b/comfy/audio_encoders/whisper.py new file mode 100755 index 000000000..93d3782f1 --- /dev/null +++ b/comfy/audio_encoders/whisper.py @@ -0,0 +1,186 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio +from typing import Optional +from comfy.ldm.modules.attention import optimized_attention_masked +import comfy.ops + +class WhisperFeatureExtractor(nn.Module): + def __init__(self, n_mels=128, device=None): + super().__init__() + self.sample_rate = 16000 + self.n_fft = 400 + self.hop_length = 160 + self.n_mels = n_mels + self.chunk_length = 30 + self.n_samples = 480000 + + self.mel_spectrogram = torchaudio.transforms.MelSpectrogram( + sample_rate=self.sample_rate, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + f_min=0, + f_max=8000, + norm="slaney", + mel_scale="slaney", + ).to(device) + + def __call__(self, audio): + audio = torch.mean(audio, dim=1) + batch_size = audio.shape[0] + processed_audio = [] + + for i in range(batch_size): + aud = audio[i] + if aud.shape[0] > self.n_samples: + aud = aud[:self.n_samples] + elif aud.shape[0] < self.n_samples: + aud = F.pad(aud, (0, self.n_samples - aud.shape[0])) + processed_audio.append(aud) + + audio = torch.stack(processed_audio) + + mel_spec = self.mel_spectrogram(audio.to(self.mel_spectrogram.spectrogram.window.device))[:, :, :-1].to(audio.device) + + log_mel_spec = torch.clamp(mel_spec, min=1e-10).log10() + log_mel_spec = torch.maximum(log_mel_spec, log_mel_spec.max() - 8.0) + log_mel_spec = (log_mel_spec + 4.0) / 4.0 + + return log_mel_spec + + +class MultiHeadAttention(nn.Module): + def __init__(self, d_model: int, n_heads: int, dtype=None, device=None, operations=None): + super().__init__() + assert d_model % n_heads == 0 + + self.d_model = d_model + self.n_heads = n_heads + self.d_k = d_model // n_heads + + self.q_proj = operations.Linear(d_model, d_model, dtype=dtype, device=device) + self.k_proj = operations.Linear(d_model, d_model, bias=False, dtype=dtype, device=device) + self.v_proj = operations.Linear(d_model, d_model, dtype=dtype, device=device) + self.out_proj = operations.Linear(d_model, d_model, dtype=dtype, device=device) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + batch_size, seq_len, _ = query.shape + + q = self.q_proj(query) + k = self.k_proj(key) + v = self.v_proj(value) + + attn_output = optimized_attention_masked(q, k, v, self.n_heads, mask) + attn_output = self.out_proj(attn_output) + + return attn_output + + +class EncoderLayer(nn.Module): + def __init__(self, d_model: int, n_heads: int, d_ff: int, dtype=None, device=None, operations=None): + super().__init__() + + self.self_attn = MultiHeadAttention(d_model, n_heads, dtype=dtype, device=device, operations=operations) + self.self_attn_layer_norm = operations.LayerNorm(d_model, dtype=dtype, device=device) + + self.fc1 = operations.Linear(d_model, d_ff, dtype=dtype, device=device) + self.fc2 = operations.Linear(d_ff, d_model, dtype=dtype, device=device) + self.final_layer_norm = operations.LayerNorm(d_model, dtype=dtype, device=device) + + def forward( + self, + x: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None + ) -> torch.Tensor: + residual = x + x = self.self_attn_layer_norm(x) + x = self.self_attn(x, x, x, attention_mask) + x = residual + x + + residual = x + x = self.final_layer_norm(x) + x = self.fc1(x) + x = F.gelu(x) + x = self.fc2(x) + x = residual + x + + return x + + +class AudioEncoder(nn.Module): + def __init__( + self, + n_mels: int = 128, + n_ctx: int = 1500, + n_state: int = 1280, + n_head: int = 20, + n_layer: int = 32, + dtype=None, + device=None, + operations=None + ): + super().__init__() + + self.conv1 = operations.Conv1d(n_mels, n_state, kernel_size=3, padding=1, dtype=dtype, device=device) + self.conv2 = operations.Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1, dtype=dtype, device=device) + + self.embed_positions = operations.Embedding(n_ctx, n_state, dtype=dtype, device=device) + + self.layers = nn.ModuleList([ + EncoderLayer(n_state, n_head, n_state * 4, dtype=dtype, device=device, operations=operations) + for _ in range(n_layer) + ]) + + self.layer_norm = operations.LayerNorm(n_state, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + + x = x.transpose(1, 2) + + x = x + comfy.ops.cast_to_input(self.embed_positions.weight[:, :x.shape[1]], x) + + all_x = () + for layer in self.layers: + all_x += (x,) + x = layer(x) + + x = self.layer_norm(x) + all_x += (x,) + return x, all_x + + +class WhisperLargeV3(nn.Module): + def __init__( + self, + n_mels: int = 128, + n_audio_ctx: int = 1500, + n_audio_state: int = 1280, + n_audio_head: int = 20, + n_audio_layer: int = 32, + dtype=None, + device=None, + operations=None + ): + super().__init__() + + self.feature_extractor = WhisperFeatureExtractor(n_mels=n_mels, device=device) + + self.encoder = AudioEncoder( + n_mels, n_audio_ctx, n_audio_state, n_audio_head, n_audio_layer, + dtype=dtype, device=device, operations=operations + ) + + def forward(self, audio): + mel = self.feature_extractor(audio) + x, all_x = self.encoder(mel) + return x, all_x From e42682b24ef033a93001ba27cc5c5aa461a61d8d Mon Sep 17 00:00:00 2001 From: rattus128 <46076784+rattus128@users.noreply.github.com> Date: Wed, 17 Sep 2025 09:21:14 +1000 Subject: [PATCH 293/759] Reduce Peak WAN inference VRAM usage (#9898) * flux: Do the xq and xk ropes one at a time This was doing independendent interleaved tensor math on the q and k tensors, leading to the holding of more than the minimum intermediates in VRAM. On a bad day, it would VRAM OOM on xk intermediates. Do everything q and then everything k, so torch can garbage collect all of qs intermediates before k allocates its intermediates. This reduces peak VRAM usage for some WAN2.2 inferences (at least). * wan: Optimize qkv intermediates on attention As commented. The former logic computed independent pieces of QKV in parallel which help more inference intermediates in VRAM spiking VRAM usage. Fully roping Q and garbage collecting the intermediates before touching K reduces the peak inference VRAM usage. --- comfy/ldm/flux/math.py | 11 +++++------ comfy/ldm/wan/model.py | 22 +++++++++++++--------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/comfy/ldm/flux/math.py b/comfy/ldm/flux/math.py index 4d743cda2..fb7cd7586 100644 --- a/comfy/ldm/flux/math.py +++ b/comfy/ldm/flux/math.py @@ -35,11 +35,10 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor: out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2) return out.to(dtype=torch.float32, device=pos.device) +def apply_rope1(x: Tensor, freqs_cis: Tensor): + x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2) + x_out = freqs_cis[..., 0] * x_[..., 0] + freqs_cis[..., 1] * x_[..., 1] + return x_out.reshape(*x.shape).type_as(x) def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor): - xq_ = xq.to(dtype=freqs_cis.dtype).reshape(*xq.shape[:-1], -1, 1, 2) - xk_ = xk.to(dtype=freqs_cis.dtype).reshape(*xk.shape[:-1], -1, 1, 2) - xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] - xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1] - return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk) - + return apply_rope1(xq, freqs_cis), apply_rope1(xk, freqs_cis) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 63472ada2..67dcf8f1e 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -8,7 +8,7 @@ from einops import rearrange from comfy.ldm.modules.attention import optimized_attention from comfy.ldm.flux.layers import EmbedND -from comfy.ldm.flux.math import apply_rope +from comfy.ldm.flux.math import apply_rope1 import comfy.ldm.common_dit import comfy.model_management import comfy.patcher_extension @@ -60,20 +60,24 @@ class WanSelfAttention(nn.Module): """ b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim - # query, key, value function - def qkv_fn(x): + def qkv_fn_q(x): q = self.norm_q(self.q(x)).view(b, s, n, d) - k = self.norm_k(self.k(x)).view(b, s, n, d) - v = self.v(x).view(b, s, n * d) - return q, k, v + return apply_rope1(q, freqs) - q, k, v = qkv_fn(x) - q, k = apply_rope(q, k, freqs) + def qkv_fn_k(x): + k = self.norm_k(self.k(x)).view(b, s, n, d) + return apply_rope1(k, freqs) + + #These two are VRAM hogs, so we want to do all of q computation and + #have pytorch garbage collect the intermediates on the sub function + #return before we touch k + q = qkv_fn_q(x) + k = qkv_fn_k(x) x = optimized_attention( q.view(b, s, n * d), k.view(b, s, n * d), - v, + self.v(x).view(b, s, n * d), heads=self.num_heads, transformer_options=transformer_options, ) From 9288c78fc5fae74d3fa7787736dea442e996303f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:12:48 -0700 Subject: [PATCH 294/759] Support the HuMo model. (#9903) --- comfy/audio_encoders/audio_encoders.py | 1 + comfy/ldm/wan/model.py | 259 ++++++++++++++++++++++++- comfy/model_base.py | 17 ++ comfy/model_detection.py | 2 + comfy/supported_models.py | 12 +- comfy_extras/nodes_wan.py | 98 ++++++++++ 6 files changed, 383 insertions(+), 6 deletions(-) diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py index 0550b2f9b..46ef21c95 100644 --- a/comfy/audio_encoders/audio_encoders.py +++ b/comfy/audio_encoders/audio_encoders.py @@ -41,6 +41,7 @@ class AudioEncoderModel(): outputs = {} outputs["encoded_audio"] = out outputs["encoded_audio_all_layers"] = all_layers + outputs["audio_samples"] = audio.shape[2] return outputs diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 67dcf8f1e..b3b7da5d5 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -34,7 +34,9 @@ class WanSelfAttention(nn.Module): num_heads, window_size=(-1, -1), qk_norm=True, - eps=1e-6, operation_settings={}): + eps=1e-6, + kv_dim=None, + operation_settings={}): assert dim % num_heads == 0 super().__init__() self.dim = dim @@ -43,11 +45,13 @@ class WanSelfAttention(nn.Module): self.window_size = window_size self.qk_norm = qk_norm self.eps = eps + if kv_dim is None: + kv_dim = dim # layers self.q = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) - self.k = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) - self.v = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.k = operation_settings.get("operations").Linear(kv_dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.v = operation_settings.get("operations").Linear(kv_dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.o = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.norm_q = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity() self.norm_k = operation_settings.get("operations").RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity() @@ -402,6 +406,7 @@ class WanModel(torch.nn.Module): eps=1e-6, flf_pos_embed_token_number=None, in_dim_ref_conv=None, + wan_attn_block_class=WanAttentionBlock, image_model=None, device=None, dtype=None, @@ -479,8 +484,8 @@ class WanModel(torch.nn.Module): # blocks cross_attn_type = 't2v_cross_attn' if model_type == 't2v' else 'i2v_cross_attn' self.blocks = nn.ModuleList([ - WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads, - window_size, qk_norm, cross_attn_norm, eps, operation_settings=operation_settings) + wan_attn_block_class(cross_attn_type, dim, ffn_dim, num_heads, + window_size, qk_norm, cross_attn_norm, eps, operation_settings=operation_settings) for _ in range(num_layers) ]) @@ -1325,3 +1330,247 @@ class WanModel_S2V(WanModel): # unpatchify x = self.unpatchify(x, grid_sizes) return x + + +class WanT2VCrossAttentionGather(WanSelfAttention): + + def forward(self, x, context, transformer_options={}, **kwargs): + r""" + Args: + x(Tensor): Shape [B, L1, C] - video tokens + context(Tensor): Shape [B, L2, C] - audio tokens with shape [B, frames*16, 1536] + """ + b, n, d = x.size(0), self.num_heads, self.head_dim + + q = self.norm_q(self.q(x)) + k = self.norm_k(self.k(context)) + v = self.v(context) + + # Handle audio temporal structure (16 tokens per frame) + k = k.reshape(-1, 16, n, d).transpose(1, 2) + v = v.reshape(-1, 16, n, d).transpose(1, 2) + + # Handle video spatial structure + q = q.reshape(k.shape[0], -1, n, d).transpose(1, 2) + + x = optimized_attention(q, k, v, heads=self.num_heads, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options) + + x = x.transpose(1, 2).view(b, -1, n, d).flatten(2) + x = self.o(x) + return x + + +class AudioCrossAttentionWrapper(nn.Module): + def __init__(self, dim, kv_dim, num_heads, qk_norm=True, eps=1e-6, operation_settings={}): + super().__init__() + + self.audio_cross_attn = WanT2VCrossAttentionGather(dim, num_heads, qk_norm, kv_dim, eps, operation_settings=operation_settings) + self.norm1_audio = operation_settings.get("operations").LayerNorm(dim, eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + + def forward(self, x, audio, transformer_options={}): + x = x + self.audio_cross_attn(self.norm1_audio(x), audio, transformer_options=transformer_options) + return x + + +class WanAttentionBlockAudio(WanAttentionBlock): + + def __init__(self, + cross_attn_type, + dim, + ffn_dim, + num_heads, + window_size=(-1, -1), + qk_norm=True, + cross_attn_norm=False, + eps=1e-6, operation_settings={}): + super().__init__(cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps, operation_settings) + self.audio_cross_attn_wrapper = AudioCrossAttentionWrapper(dim, 1536, num_heads, qk_norm, eps, operation_settings=operation_settings) + + def forward( + self, + x, + e, + freqs, + context, + context_img_len=257, + audio=None, + transformer_options={}, + ): + r""" + Args: + x(Tensor): Shape [B, L, C] + e(Tensor): Shape [B, 6, C] + freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2] + """ + # assert e.dtype == torch.float32 + + if e.ndim < 4: + e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1) + else: + e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e).unbind(2) + # assert e[0].dtype == torch.float32 + + # self-attention + y = self.self_attn( + torch.addcmul(repeat_e(e[0], x), self.norm1(x), 1 + repeat_e(e[1], x)), + freqs, transformer_options=transformer_options) + + x = torch.addcmul(x, y, repeat_e(e[2], x)) + + # cross-attention & ffn + x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len, transformer_options=transformer_options) + if audio is not None: + x = self.audio_cross_attn_wrapper(x, audio, transformer_options=transformer_options) + y = self.ffn(torch.addcmul(repeat_e(e[3], x), self.norm2(x), 1 + repeat_e(e[4], x))) + x = torch.addcmul(x, y, repeat_e(e[5], x)) + return x + +class DummyAdapterLayer(nn.Module): + def __init__(self, layer): + super().__init__() + self.layer = layer + + def forward(self, *args, **kwargs): + return self.layer(*args, **kwargs) + + +class AudioProjModel(nn.Module): + def __init__( + self, + seq_len=5, + blocks=13, # add a new parameter blocks + channels=768, # add a new parameter channels + intermediate_dim=512, + output_dim=1536, + context_tokens=16, + device=None, + dtype=None, + operations=None, + ): + super().__init__() + + self.seq_len = seq_len + self.blocks = blocks + self.channels = channels + self.input_dim = seq_len * blocks * channels # update input_dim to be the product of blocks and channels. + self.intermediate_dim = intermediate_dim + self.context_tokens = context_tokens + self.output_dim = output_dim + + # define multiple linear layers + self.audio_proj_glob_1 = DummyAdapterLayer(operations.Linear(self.input_dim, intermediate_dim, dtype=dtype, device=device)) + self.audio_proj_glob_2 = DummyAdapterLayer(operations.Linear(intermediate_dim, intermediate_dim, dtype=dtype, device=device)) + self.audio_proj_glob_3 = DummyAdapterLayer(operations.Linear(intermediate_dim, context_tokens * output_dim, dtype=dtype, device=device)) + + self.audio_proj_glob_norm = DummyAdapterLayer(operations.LayerNorm(output_dim, dtype=dtype, device=device)) + + def forward(self, audio_embeds): + video_length = audio_embeds.shape[1] + audio_embeds = rearrange(audio_embeds, "bz f w b c -> (bz f) w b c") + batch_size, window_size, blocks, channels = audio_embeds.shape + audio_embeds = audio_embeds.view(batch_size, window_size * blocks * channels) + + audio_embeds = torch.relu(self.audio_proj_glob_1(audio_embeds)) + audio_embeds = torch.relu(self.audio_proj_glob_2(audio_embeds)) + + context_tokens = self.audio_proj_glob_3(audio_embeds).reshape(batch_size, self.context_tokens, self.output_dim) + + context_tokens = self.audio_proj_glob_norm(context_tokens) + context_tokens = rearrange(context_tokens, "(bz f) m c -> bz f m c", f=video_length) + + return context_tokens + + +class HumoWanModel(WanModel): + r""" + Wan diffusion backbone supporting both text-to-video and image-to-video. + """ + + def __init__(self, + model_type='humo', + patch_size=(1, 2, 2), + text_len=512, + in_dim=16, + dim=2048, + ffn_dim=8192, + freq_dim=256, + text_dim=4096, + out_dim=16, + num_heads=16, + num_layers=32, + window_size=(-1, -1), + qk_norm=True, + cross_attn_norm=True, + eps=1e-6, + flf_pos_embed_token_number=None, + image_model=None, + audio_token_num=16, + device=None, + dtype=None, + operations=None, + ): + + super().__init__(model_type='t2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, wan_attn_block_class=WanAttentionBlockAudio, image_model=image_model, device=device, dtype=dtype, operations=operations) + + self.audio_proj = AudioProjModel(seq_len=8, blocks=5, channels=1280, intermediate_dim=512, output_dim=1536, context_tokens=audio_token_num, dtype=dtype, device=device, operations=operations) + + def forward_orig( + self, + x, + t, + context, + freqs=None, + audio_embed=None, + reference_latent=None, + transformer_options={}, + **kwargs, + ): + bs, _, time, height, width = x.shape + + # embeddings + x = self.patch_embedding(x.float()).to(x.dtype) + grid_sizes = x.shape[2:] + x = x.flatten(2).transpose(1, 2) + + # time embeddings + e = self.time_embedding( + sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype)) + e = e.reshape(t.shape[0], -1, e.shape[-1]) + e0 = self.time_projection(e).unflatten(2, (6, self.dim)) + + if reference_latent is not None: + ref = self.patch_embedding(reference_latent.float()).to(x.dtype) + ref = ref.flatten(2).transpose(1, 2) + freqs_ref = self.rope_encode(reference_latent.shape[-3], reference_latent.shape[-2], reference_latent.shape[-1], t_start=time, device=x.device, dtype=x.dtype) + x = torch.cat([x, ref], dim=1) + freqs = torch.cat([freqs, freqs_ref], dim=1) + del ref, freqs_ref + + # context + context = self.text_embedding(context) + context_img_len = None + + if audio_embed is not None: + audio = self.audio_proj(audio_embed).permute(0, 3, 1, 2).flatten(2).transpose(1, 2) + else: + audio = None + + patches_replace = transformer_options.get("patches_replace", {}) + blocks_replace = patches_replace.get("dit", {}) + for i, block in enumerate(self.blocks): + if ("double_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, audio=audio, transformer_options=args["transformer_options"]) + return out + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap}) + x = out["img"] + else: + x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, audio=audio, transformer_options=transformer_options) + + # head + x = self.head(x, e) + + # unpatchify + x = self.unpatchify(x, grid_sizes) + return x diff --git a/comfy/model_base.py b/comfy/model_base.py index 252dfcf69..cf99035da 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1213,6 +1213,23 @@ class WAN21_Camera(WAN21): out['camera_conditions'] = comfy.conds.CONDRegular(camera_conditions) return out +class WAN21_HuMo(WAN21): + def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): + super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.HumoWanModel) + self.image_to_video = image_to_video + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + + audio_embed = kwargs.get("audio_embed", None) + if audio_embed is not None: + out['audio_embed'] = comfy.conds.CONDRegular(audio_embed) + + reference_latents = kwargs.get("reference_latents", None) + if reference_latents is not None: + out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])) + return out + class WAN22_S2V(WAN21): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel_S2V) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 03d44f65e..72621bed6 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -402,6 +402,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["model_type"] = "camera_2.2" elif '{}casual_audio_encoder.encoder.final_linear.weight'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "s2v" + elif '{}audio_proj.audio_proj_glob_1.layer.bias'.format(key_prefix) in state_dict_keys: + dit_config["model_type"] = "humo" else: if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "i2v" diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 557902d11..213b5b92c 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1073,6 +1073,16 @@ class WAN21_Vace(WAN21_T2V): out = model_base.WAN21_Vace(self, image_to_video=False, device=device) return out +class WAN21_HuMo(WAN21_T2V): + unet_config = { + "image_model": "wan2.1", + "model_type": "humo", + } + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.WAN21_HuMo(self, image_to_video=False, device=device) + return out + class WAN22_S2V(WAN21_T2V): unet_config = { "image_model": "wan2.1", @@ -1351,6 +1361,6 @@ class HunyuanImage21Refiner(HunyuanVideo): out = model_base.HunyuanImage21Refiner(self, device=device) return out -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 4f73369f5..0b8b55813 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1015,6 +1015,103 @@ class WanSoundImageToVideoExtend(io.ComfyNode): return io.NodeOutput(positive, negative, out_latent) +def get_audio_emb_window(audio_emb, frame_num, frame0_idx, audio_shift=2): + zero_audio_embed = torch.zeros((audio_emb.shape[1], audio_emb.shape[2]), dtype=audio_emb.dtype, device=audio_emb.device) + zero_audio_embed_3 = torch.zeros((3, audio_emb.shape[1], audio_emb.shape[2]), dtype=audio_emb.dtype, device=audio_emb.device) # device=audio_emb.device + iter_ = 1 + (frame_num - 1) // 4 + audio_emb_wind = [] + for lt_i in range(iter_): + if lt_i == 0: + st = frame0_idx + lt_i - 2 + ed = frame0_idx + lt_i + 3 + wind_feat = torch.stack([ + audio_emb[i] if (0 <= i < audio_emb.shape[0]) else zero_audio_embed + for i in range(st, ed) + ], dim=0) + wind_feat = torch.cat((zero_audio_embed_3, wind_feat), dim=0) + else: + st = frame0_idx + 1 + 4 * (lt_i - 1) - audio_shift + ed = frame0_idx + 1 + 4 * lt_i + audio_shift + wind_feat = torch.stack([ + audio_emb[i] if (0 <= i < audio_emb.shape[0]) else zero_audio_embed + for i in range(st, ed) + ], dim=0) + audio_emb_wind.append(wind_feat) + audio_emb_wind = torch.stack(audio_emb_wind, dim=0) + + return audio_emb_wind, ed - audio_shift + + +class WanHuMoImageToVideo(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanHuMoImageToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=97, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.AudioEncoderOutput.Input("audio_encoder_output", optional=True), + io.Image.Input("ref_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + is_experimental=True, + ) + + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, audio_encoder_output=None) -> io.NodeOutput: + latent_t = ((length - 1) // 4) + 1 + latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + + if ref_image is not None: + ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + ref_latent = vae.encode(ref_image[:, :, :, :3]) + positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) + negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [torch.zeros_like(ref_latent)]}, append=True) + else: + zero_latent = torch.zeros([batch_size, 16, 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [zero_latent]}, append=True) + negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [zero_latent]}, append=True) + + if audio_encoder_output is not None: + audio_emb = torch.stack(audio_encoder_output["encoded_audio_all_layers"], dim=2) + audio_len = audio_encoder_output["audio_samples"] // 640 + audio_emb = audio_emb[:, :audio_len * 2] + + feat0 = linear_interpolation(audio_emb[:, :, 0: 8].mean(dim=2), 50, 25) + feat1 = linear_interpolation(audio_emb[:, :, 8: 16].mean(dim=2), 50, 25) + feat2 = linear_interpolation(audio_emb[:, :, 16: 24].mean(dim=2), 50, 25) + feat3 = linear_interpolation(audio_emb[:, :, 24: 32].mean(dim=2), 50, 25) + feat4 = linear_interpolation(audio_emb[:, :, 32], 50, 25) + audio_emb = torch.stack([feat0, feat1, feat2, feat3, feat4], dim=2)[0] # [T, 5, 1280] + audio_emb, _ = get_audio_emb_window(audio_emb, length, frame0_idx=0) + + # pad for ref latent + zero_audio_pad = torch.zeros(ref_latent.shape[2], *audio_emb.shape[1:], device=audio_emb.device, dtype=audio_emb.dtype) + audio_emb = torch.cat([audio_emb, zero_audio_pad], dim=0) + + audio_emb = audio_emb.unsqueeze(0) + audio_emb_neg = torch.zeros_like(audio_emb) + positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_emb}) + negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_emb_neg}) + else: + zero_audio = torch.zeros([batch_size, latent_t + 1, 8, 5, 1280], device=comfy.model_management.intermediate_device()) + positive = node_helpers.conditioning_set_values(positive, {"audio_embed": zero_audio}) + negative = node_helpers.conditioning_set_values(negative, {"audio_embed": zero_audio}) + + out_latent = {} + out_latent["samples"] = latent + return io.NodeOutput(positive, negative, out_latent) + class Wan22ImageToVideoLatent(io.ComfyNode): @classmethod def define_schema(cls): @@ -1075,6 +1172,7 @@ class WanExtension(ComfyExtension): WanPhantomSubjectToVideo, WanSoundImageToVideo, WanSoundImageToVideoExtend, + WanHuMoImageToVideo, Wan22ImageToVideoLatent, ] From dd611a7700956f45f393dee32fb8505de176dc66 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 17 Sep 2025 15:39:24 -0700 Subject: [PATCH 295/759] Support the HuMo 17B model. (#9912) --- comfy/ldm/wan/model.py | 2 +- comfy/model_base.py | 29 ++++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index b3b7da5d5..9cf3c171d 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -1364,7 +1364,7 @@ class AudioCrossAttentionWrapper(nn.Module): def __init__(self, dim, kv_dim, num_heads, qk_norm=True, eps=1e-6, operation_settings={}): super().__init__() - self.audio_cross_attn = WanT2VCrossAttentionGather(dim, num_heads, qk_norm, kv_dim, eps, operation_settings=operation_settings) + self.audio_cross_attn = WanT2VCrossAttentionGather(dim, num_heads, qk_norm=qk_norm, kv_dim=kv_dim, eps=eps, operation_settings=operation_settings) self.norm1_audio = operation_settings.get("operations").LayerNorm(dim, eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) def forward(self, x, audio, transformer_options={}): diff --git a/comfy/model_base.py b/comfy/model_base.py index cf99035da..70b67b7c1 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1220,14 +1220,37 @@ class WAN21_HuMo(WAN21): def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) + noise = kwargs.get("noise", None) audio_embed = kwargs.get("audio_embed", None) if audio_embed is not None: out['audio_embed'] = comfy.conds.CONDRegular(audio_embed) - reference_latents = kwargs.get("reference_latents", None) - if reference_latents is not None: - out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])) + if "c_concat" not in out: # 1.7B model + reference_latents = kwargs.get("reference_latents", None) + if reference_latents is not None: + out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1])) + else: + noise_shape = list(noise.shape) + noise_shape[1] += 4 + concat_latent = torch.zeros(noise_shape, device=noise.device, dtype=noise.dtype) + zero_vae_values_first = torch.tensor([0.8660, -0.4326, -0.0017, -0.4884, -0.5283, 0.9207, -0.9896, 0.4433, -0.5543, -0.0113, 0.5753, -0.6000, -0.8346, -0.3497, -0.1926, -0.6938]).view(1, 16, 1, 1, 1) + zero_vae_values_second = torch.tensor([1.0869, -1.2370, 0.0206, -0.4357, -0.6411, 2.0307, -1.5972, 1.2659, -0.8595, -0.4654, 0.9638, -1.6330, -1.4310, -0.1098, -0.3856, -1.4583]).view(1, 16, 1, 1, 1) + zero_vae_values = torch.tensor([0.8642, -1.8583, 0.1577, 0.1350, -0.3641, 2.5863, -1.9670, 1.6065, -1.0475, -0.8678, 1.1734, -1.8138, -1.5933, -0.7721, -0.3289, -1.3745]).view(1, 16, 1, 1, 1) + concat_latent[:, 4:] = zero_vae_values + concat_latent[:, 4:, :1] = zero_vae_values_first + concat_latent[:, 4:, 1:2] = zero_vae_values_second + out['c_concat'] = comfy.conds.CONDNoiseShape(concat_latent) + reference_latents = kwargs.get("reference_latents", None) + if reference_latents is not None: + ref_latent = self.process_latent_in(reference_latents[-1]) + ref_latent_shape = list(ref_latent.shape) + ref_latent_shape[1] += 4 + ref_latent_shape[1] + ref_latent_full = torch.zeros(ref_latent_shape, device=ref_latent.device, dtype=ref_latent.dtype) + ref_latent_full[:, 20:] = ref_latent + ref_latent_full[:, 16:20] = 1.0 + out['reference_latent'] = comfy.conds.CONDRegular(ref_latent_full) + return out class WAN22_S2V(WAN21): From 8d6653fca676a08df3e11654672fed92a183d147 Mon Sep 17 00:00:00 2001 From: DELUXA Date: Fri, 19 Sep 2025 02:50:37 +0300 Subject: [PATCH 296/759] Enable fp8 ops by default on gfx1200 (#9926) --- comfy/model_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index bbfc3c7a1..d880f1970 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -348,7 +348,7 @@ try: # if any((a in arch) for a in ["gfx1201"]): # ENABLE_PYTORCH_ATTENTION = True if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4): - if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]): # TODO: more arches + if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx942", "gfx950"]): # TODO: more arches SUPPORT_FP8_OPS = True except: From 1ea8c540640913b247248e46c907fb9b92a9dd4b Mon Sep 17 00:00:00 2001 From: Jodh Singh Date: Thu, 18 Sep 2025 19:51:16 -0400 Subject: [PATCH 297/759] make kernel of same type as image to avoid mismatch issues (#9932) --- comfy_extras/nodes_post_processing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy_extras/nodes_post_processing.py b/comfy_extras/nodes_post_processing.py index cb1a0d883..ed7a07152 100644 --- a/comfy_extras/nodes_post_processing.py +++ b/comfy_extras/nodes_post_processing.py @@ -233,6 +233,7 @@ class Sharpen: kernel_size = sharpen_radius * 2 + 1 kernel = gaussian_kernel(kernel_size, sigma, device=image.device) * -(alpha*10) + kernel = kernel.to(dtype=image.dtype) center = kernel_size // 2 kernel[center, center] = kernel[center, center] - kernel.sum() + 1.0 kernel = kernel.repeat(channels, 1, 1).unsqueeze(1) From 24b0fce099c56d18ceb1f4f6b9455fee55e154ce Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 18 Sep 2025 16:54:16 -0700 Subject: [PATCH 298/759] Do padding of audio embed in model for humo for more flexibility. (#9935) --- comfy/ldm/wan/model.py | 3 +++ comfy_extras/nodes_wan.py | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 9cf3c171d..2dac5980c 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -1551,6 +1551,9 @@ class HumoWanModel(WanModel): context_img_len = None if audio_embed is not None: + if reference_latent is not None: + zero_audio_pad = torch.zeros(audio_embed.shape[0], reference_latent.shape[-3], *audio_embed.shape[2:], device=audio_embed.device, dtype=audio_embed.dtype) + audio_embed = torch.cat([audio_embed, zero_audio_pad], dim=1) audio = self.audio_proj(audio_embed).permute(0, 3, 1, 2).flatten(2).transpose(1, 2) else: audio = None diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 0b8b55813..5f10edcff 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1095,10 +1095,6 @@ class WanHuMoImageToVideo(io.ComfyNode): audio_emb = torch.stack([feat0, feat1, feat2, feat3, feat4], dim=2)[0] # [T, 5, 1280] audio_emb, _ = get_audio_emb_window(audio_emb, length, frame0_idx=0) - # pad for ref latent - zero_audio_pad = torch.zeros(ref_latent.shape[2], *audio_emb.shape[1:], device=audio_emb.device, dtype=audio_emb.dtype) - audio_emb = torch.cat([audio_emb, zero_audio_pad], dim=0) - audio_emb = audio_emb.unsqueeze(0) audio_emb_neg = torch.zeros_like(audio_emb) positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_emb}) From 711bcf33ee505a997674f4a9125e69d2a5a3c180 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Fri, 19 Sep 2025 00:03:30 -0700 Subject: [PATCH 299/759] Bump frontend to 1.26.13 (#9933) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index de5af5fac..79187efaa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.26.11 +comfyui-frontend-package==1.26.13 comfyui-workflow-templates==0.1.81 comfyui-embedded-docs==0.2.6 torch From dc95b6acc0ef4962460592d417db4024f7160586 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 19 Sep 2025 00:07:17 -0700 Subject: [PATCH 300/759] Basic WIP support for the wan animate model. (#9939) --- comfy/ldm/wan/model_animate.py | 548 +++++++++++++++++++++++++++++++++ comfy/model_base.py | 18 ++ comfy/model_detection.py | 2 + comfy/supported_models.py | 15 +- comfy_extras/nodes_wan.py | 84 +++++ 5 files changed, 666 insertions(+), 1 deletion(-) create mode 100644 comfy/ldm/wan/model_animate.py diff --git a/comfy/ldm/wan/model_animate.py b/comfy/ldm/wan/model_animate.py new file mode 100644 index 000000000..542f54110 --- /dev/null +++ b/comfy/ldm/wan/model_animate.py @@ -0,0 +1,548 @@ +from torch import nn +import torch +from typing import Tuple, Optional +from einops import rearrange +import torch.nn.functional as F +import math +from .model import WanModel, sinusoidal_embedding_1d +from comfy.ldm.modules.attention import optimized_attention +import comfy.model_management + +class CausalConv1d(nn.Module): + + def __init__(self, chan_in, chan_out, kernel_size=3, stride=1, dilation=1, pad_mode="replicate", operations=None, **kwargs): + super().__init__() + + self.pad_mode = pad_mode + padding = (kernel_size - 1, 0) # T + self.time_causal_padding = padding + + self.conv = operations.Conv1d(chan_in, chan_out, kernel_size, stride=stride, dilation=dilation, **kwargs) + + def forward(self, x): + x = F.pad(x, self.time_causal_padding, mode=self.pad_mode) + return self.conv(x) + + +class FaceEncoder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int, num_heads=int, dtype=None, device=None, operations=None): + factory_kwargs = {"dtype": dtype, "device": device} + super().__init__() + + self.num_heads = num_heads + self.conv1_local = CausalConv1d(in_dim, 1024 * num_heads, 3, stride=1, operations=operations, **factory_kwargs) + self.norm1 = operations.LayerNorm(hidden_dim // 8, elementwise_affine=False, eps=1e-6, **factory_kwargs) + self.act = nn.SiLU() + self.conv2 = CausalConv1d(1024, 1024, 3, stride=2, operations=operations, **factory_kwargs) + self.conv3 = CausalConv1d(1024, 1024, 3, stride=2, operations=operations, **factory_kwargs) + + self.out_proj = operations.Linear(1024, hidden_dim, **factory_kwargs) + self.norm1 = operations.LayerNorm(1024, elementwise_affine=False, eps=1e-6, **factory_kwargs) + + self.norm2 = operations.LayerNorm(1024, elementwise_affine=False, eps=1e-6, **factory_kwargs) + + self.norm3 = operations.LayerNorm(1024, elementwise_affine=False, eps=1e-6, **factory_kwargs) + + self.padding_tokens = nn.Parameter(torch.empty(1, 1, 1, hidden_dim, **factory_kwargs)) + + def forward(self, x): + + x = rearrange(x, "b t c -> b c t") + b, c, t = x.shape + + x = self.conv1_local(x) + x = rearrange(x, "b (n c) t -> (b n) t c", n=self.num_heads) + + x = self.norm1(x) + x = self.act(x) + x = rearrange(x, "b t c -> b c t") + x = self.conv2(x) + x = rearrange(x, "b c t -> b t c") + x = self.norm2(x) + x = self.act(x) + x = rearrange(x, "b t c -> b c t") + x = self.conv3(x) + x = rearrange(x, "b c t -> b t c") + x = self.norm3(x) + x = self.act(x) + x = self.out_proj(x) + x = rearrange(x, "(b n) t c -> b t n c", b=b) + padding = comfy.model_management.cast_to(self.padding_tokens, dtype=x.dtype, device=x.device).repeat(b, x.shape[1], 1, 1) + x = torch.cat([x, padding], dim=-2) + x_local = x.clone() + + return x_local + + +def get_norm_layer(norm_layer, operations=None): + """ + Get the normalization layer. + + Args: + norm_layer (str): The type of normalization layer. + + Returns: + norm_layer (nn.Module): The normalization layer. + """ + if norm_layer == "layer": + return operations.LayerNorm + elif norm_layer == "rms": + return operations.RMSNorm + else: + raise NotImplementedError(f"Norm layer {norm_layer} is not implemented") + + +class FaceAdapter(nn.Module): + def __init__( + self, + hidden_dim: int, + heads_num: int, + qk_norm: bool = True, + qk_norm_type: str = "rms", + num_adapter_layers: int = 1, + dtype=None, device=None, operations=None + ): + + factory_kwargs = {"dtype": dtype, "device": device} + super().__init__() + self.hidden_size = hidden_dim + self.heads_num = heads_num + self.fuser_blocks = nn.ModuleList( + [ + FaceBlock( + self.hidden_size, + self.heads_num, + qk_norm=qk_norm, + qk_norm_type=qk_norm_type, + operations=operations, + **factory_kwargs, + ) + for _ in range(num_adapter_layers) + ] + ) + + def forward( + self, + x: torch.Tensor, + motion_embed: torch.Tensor, + idx: int, + freqs_cis_q: Tuple[torch.Tensor, torch.Tensor] = None, + freqs_cis_k: Tuple[torch.Tensor, torch.Tensor] = None, + ) -> torch.Tensor: + + return self.fuser_blocks[idx](x, motion_embed, freqs_cis_q, freqs_cis_k) + + + +class FaceBlock(nn.Module): + def __init__( + self, + hidden_size: int, + heads_num: int, + qk_norm: bool = True, + qk_norm_type: str = "rms", + qk_scale: float = None, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + operations=None + ): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + + self.deterministic = False + self.hidden_size = hidden_size + self.heads_num = heads_num + head_dim = hidden_size // heads_num + self.scale = qk_scale or head_dim**-0.5 + + self.linear1_kv = operations.Linear(hidden_size, hidden_size * 2, **factory_kwargs) + self.linear1_q = operations.Linear(hidden_size, hidden_size, **factory_kwargs) + + self.linear2 = operations.Linear(hidden_size, hidden_size, **factory_kwargs) + + qk_norm_layer = get_norm_layer(qk_norm_type, operations=operations) + self.q_norm = ( + qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity() + ) + self.k_norm = ( + qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs) if qk_norm else nn.Identity() + ) + + self.pre_norm_feat = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs) + + self.pre_norm_motion = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **factory_kwargs) + + def forward( + self, + x: torch.Tensor, + motion_vec: torch.Tensor, + motion_mask: Optional[torch.Tensor] = None, + # use_context_parallel=False, + ) -> torch.Tensor: + + B, T, N, C = motion_vec.shape + T_comp = T + + x_motion = self.pre_norm_motion(motion_vec) + x_feat = self.pre_norm_feat(x) + + kv = self.linear1_kv(x_motion) + q = self.linear1_q(x_feat) + + k, v = rearrange(kv, "B L N (K H D) -> K B L N H D", K=2, H=self.heads_num) + q = rearrange(q, "B S (H D) -> B S H D", H=self.heads_num) + + # Apply QK-Norm if needed. + q = self.q_norm(q).to(v) + k = self.k_norm(k).to(v) + + k = rearrange(k, "B L N H D -> (B L) N H D") + v = rearrange(v, "B L N H D -> (B L) N H D") + + q = rearrange(q, "B (L S) H D -> (B L) S (H D)", L=T_comp) + + attn = optimized_attention(q, k, v, heads=self.heads_num) + + attn = rearrange(attn, "(B L) S C -> B (L S) C", L=T_comp) + + output = self.linear2(attn) + + if motion_mask is not None: + output = output * rearrange(motion_mask, "B T H W -> B (T H W)").unsqueeze(-1) + + return output + +# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/ops/upfirdn2d/upfirdn2d.py#L162 +def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1): + _, minor, in_h, in_w = input.shape + kernel_h, kernel_w = kernel.shape + + out = input.view(-1, minor, in_h, 1, in_w, 1) + out = F.pad(out, [0, up_x - 1, 0, 0, 0, up_y - 1, 0, 0]) + out = out.view(-1, minor, in_h * up_y, in_w * up_x) + + out = F.pad(out, [max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)]) + out = out[:, :, max(-pad_y0, 0): out.shape[2] - max(-pad_y1, 0), max(-pad_x0, 0): out.shape[3] - max(-pad_x1, 0)] + + out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) + w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w) + out = F.conv2d(out, w) + out = out.reshape(-1, minor, in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1) + return out[:, :, ::down_y, ::down_x] + +def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)): + return upfirdn2d_native(input, kernel, up, up, down, down, pad[0], pad[1], pad[0], pad[1]) + +# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/ops/fused_act/fused_act.py#L81 +class FusedLeakyReLU(torch.nn.Module): + def __init__(self, channel, negative_slope=0.2, scale=2 ** 0.5, dtype=None, device=None): + super().__init__() + self.bias = torch.nn.Parameter(torch.empty(1, channel, 1, 1, dtype=dtype, device=device)) + self.negative_slope = negative_slope + self.scale = scale + + def forward(self, input): + return fused_leaky_relu(input, comfy.model_management.cast_to(self.bias, device=input.device, dtype=input.dtype), self.negative_slope, self.scale) + +def fused_leaky_relu(input, bias, negative_slope=0.2, scale=2 ** 0.5): + return F.leaky_relu(input + bias, negative_slope) * scale + +class Blur(torch.nn.Module): + def __init__(self, kernel, pad, dtype=None, device=None): + super().__init__() + kernel = torch.tensor(kernel, dtype=dtype, device=device) + kernel = kernel[None, :] * kernel[:, None] + kernel = kernel / kernel.sum() + self.register_buffer('kernel', kernel) + self.pad = pad + + def forward(self, input): + return upfirdn2d(input, comfy.model_management.cast_to(self.kernel, dtype=input.dtype, device=input.device), pad=self.pad) + +#https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L590 +class ScaledLeakyReLU(torch.nn.Module): + def __init__(self, negative_slope=0.2): + super().__init__() + self.negative_slope = negative_slope + + def forward(self, input): + return F.leaky_relu(input, negative_slope=self.negative_slope) + +# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L605 +class EqualConv2d(torch.nn.Module): + def __init__(self, in_channel, out_channel, kernel_size, stride=1, padding=0, bias=True, dtype=None, device=None, operations=None): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(out_channel, in_channel, kernel_size, kernel_size, device=device, dtype=dtype)) + self.scale = 1 / math.sqrt(in_channel * kernel_size ** 2) + self.stride = stride + self.padding = padding + self.bias = torch.nn.Parameter(torch.empty(out_channel, device=device, dtype=dtype)) if bias else None + + def forward(self, input): + if self.bias is None: + bias = None + else: + bias = comfy.model_management.cast_to(self.bias, device=input.device, dtype=input.dtype) + + return F.conv2d(input, comfy.model_management.cast_to(self.weight, device=input.device, dtype=input.dtype) * self.scale, bias=bias, stride=self.stride, padding=self.padding) + +# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L134 +class EqualLinear(torch.nn.Module): + def __init__(self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1, activation=None, dtype=None, device=None, operations=None): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(out_dim, in_dim, device=device, dtype=dtype)) + self.bias = torch.nn.Parameter(torch.empty(out_dim, device=device, dtype=dtype)) if bias else None + self.activation = activation + self.scale = (1 / math.sqrt(in_dim)) * lr_mul + self.lr_mul = lr_mul + + def forward(self, input): + if self.bias is None: + bias = None + else: + bias = comfy.model_management.cast_to(self.bias, device=input.device, dtype=input.dtype) * self.lr_mul + + if self.activation: + out = F.linear(input, comfy.model_management.cast_to(self.weight, device=input.device, dtype=input.dtype) * self.scale) + return fused_leaky_relu(out, bias) + return F.linear(input, comfy.model_management.cast_to(self.weight, device=input.device, dtype=input.dtype) * self.scale, bias=bias) + +# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L654 +class ConvLayer(torch.nn.Sequential): + def __init__(self, in_channel, out_channel, kernel_size, downsample=False, blur_kernel=[1, 3, 3, 1], bias=True, activate=True, dtype=None, device=None, operations=None): + layers = [] + + if downsample: + factor = 2 + p = (len(blur_kernel) - factor) + (kernel_size - 1) + layers.append(Blur(blur_kernel, pad=((p + 1) // 2, p // 2))) + stride, padding = 2, 0 + else: + stride, padding = 1, kernel_size // 2 + + layers.append(EqualConv2d(in_channel, out_channel, kernel_size, padding=padding, stride=stride, bias=bias and not activate, dtype=dtype, device=device, operations=operations)) + + if activate: + layers.append(FusedLeakyReLU(out_channel) if bias else ScaledLeakyReLU(0.2)) + + super().__init__(*layers) + +# https://github.com/XPixelGroup/BasicSR/blob/8d56e3a045f9fb3e1d8872f92ee4a4f07f886b0a/basicsr/archs/stylegan2_arch.py#L704 +class ResBlock(torch.nn.Module): + def __init__(self, in_channel, out_channel, dtype=None, device=None, operations=None): + super().__init__() + self.conv1 = ConvLayer(in_channel, in_channel, 3, dtype=dtype, device=device, operations=operations) + self.conv2 = ConvLayer(in_channel, out_channel, 3, downsample=True, dtype=dtype, device=device, operations=operations) + self.skip = ConvLayer(in_channel, out_channel, 1, downsample=True, activate=False, bias=False, dtype=dtype, device=device, operations=operations) + + def forward(self, input): + out = self.conv2(self.conv1(input)) + skip = self.skip(input) + return (out + skip) / math.sqrt(2) + + +class EncoderApp(torch.nn.Module): + def __init__(self, w_dim=512, dtype=None, device=None, operations=None): + super().__init__() + kwargs = {"device": device, "dtype": dtype, "operations": operations} + + self.convs = torch.nn.ModuleList([ + ConvLayer(3, 32, 1, **kwargs), ResBlock(32, 64, **kwargs), + ResBlock(64, 128, **kwargs), ResBlock(128, 256, **kwargs), + ResBlock(256, 512, **kwargs), ResBlock(512, 512, **kwargs), + ResBlock(512, 512, **kwargs), ResBlock(512, 512, **kwargs), + EqualConv2d(512, w_dim, 4, padding=0, bias=False, **kwargs) + ]) + + def forward(self, x): + h = x + for conv in self.convs: + h = conv(h) + return h.squeeze(-1).squeeze(-1) + +class Encoder(torch.nn.Module): + def __init__(self, dim=512, motion_dim=20, dtype=None, device=None, operations=None): + super().__init__() + self.net_app = EncoderApp(dim, dtype=dtype, device=device, operations=operations) + self.fc = torch.nn.Sequential(*[EqualLinear(dim, dim, dtype=dtype, device=device, operations=operations) for _ in range(4)] + [EqualLinear(dim, motion_dim, dtype=dtype, device=device, operations=operations)]) + + def encode_motion(self, x): + return self.fc(self.net_app(x)) + +class Direction(torch.nn.Module): + def __init__(self, motion_dim, dtype=None, device=None, operations=None): + super().__init__() + self.weight = torch.nn.Parameter(torch.empty(512, motion_dim, device=device, dtype=dtype)) + self.motion_dim = motion_dim + + def forward(self, input): + stabilized_weight = comfy.model_management.cast_to(self.weight, device=input.device, dtype=input.dtype) + 1e-8 * torch.eye(512, self.motion_dim, device=input.device, dtype=input.dtype) + Q, _ = torch.linalg.qr(stabilized_weight.float()) + if input is None: + return Q + return torch.sum(input.unsqueeze(-1) * Q.T.to(input.dtype), dim=1) + +class Synthesis(torch.nn.Module): + def __init__(self, motion_dim, dtype=None, device=None, operations=None): + super().__init__() + self.direction = Direction(motion_dim, dtype=dtype, device=device, operations=operations) + +class Generator(torch.nn.Module): + def __init__(self, style_dim=512, motion_dim=20, dtype=None, device=None, operations=None): + super().__init__() + self.enc = Encoder(style_dim, motion_dim, dtype=dtype, device=device, operations=operations) + self.dec = Synthesis(motion_dim, dtype=dtype, device=device, operations=operations) + + def get_motion(self, img): + motion_feat = self.enc.encode_motion(img) + return self.dec.direction(motion_feat) + +class AnimateWanModel(WanModel): + r""" + Wan diffusion backbone supporting both text-to-video and image-to-video. + """ + + def __init__(self, + model_type='animate', + patch_size=(1, 2, 2), + text_len=512, + in_dim=16, + dim=2048, + ffn_dim=8192, + freq_dim=256, + text_dim=4096, + out_dim=16, + num_heads=16, + num_layers=32, + window_size=(-1, -1), + qk_norm=True, + cross_attn_norm=True, + eps=1e-6, + flf_pos_embed_token_number=None, + motion_encoder_dim=512, + image_model=None, + device=None, + dtype=None, + operations=None, + ): + + super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations) + + self.pose_patch_embedding = operations.Conv3d( + 16, dim, kernel_size=patch_size, stride=patch_size, device=device, dtype=dtype + ) + + self.motion_encoder = Generator(style_dim=512, motion_dim=20, device=device, dtype=dtype, operations=operations) + + self.face_adapter = FaceAdapter( + heads_num=self.num_heads, + hidden_dim=self.dim, + num_adapter_layers=self.num_layers // 5, + device=device, dtype=dtype, operations=operations + ) + + self.face_encoder = FaceEncoder( + in_dim=motion_encoder_dim, + hidden_dim=self.dim, + num_heads=4, + device=device, dtype=dtype, operations=operations + ) + + def after_patch_embedding(self, x, pose_latents, face_pixel_values): + if pose_latents is not None: + pose_latents = self.pose_patch_embedding(pose_latents) + x[:, :, 1:] += pose_latents + + if face_pixel_values is None: + return x, None + + b, c, T, h, w = face_pixel_values.shape + face_pixel_values = rearrange(face_pixel_values, "b c t h w -> (b t) c h w") + encode_bs = 8 + face_pixel_values_tmp = [] + for i in range(math.ceil(face_pixel_values.shape[0] / encode_bs)): + face_pixel_values_tmp.append(self.motion_encoder.get_motion(face_pixel_values[i * encode_bs: (i + 1) * encode_bs])) + + motion_vec = torch.cat(face_pixel_values_tmp) + + motion_vec = rearrange(motion_vec, "(b t) c -> b t c", t=T) + motion_vec = self.face_encoder(motion_vec) + + B, L, H, C = motion_vec.shape + pad_face = torch.zeros(B, 1, H, C).type_as(motion_vec) + motion_vec = torch.cat([pad_face, motion_vec], dim=1) + + if motion_vec.shape[1] < x.shape[2]: + B, L, H, C = motion_vec.shape + pad = torch.zeros(B, x.shape[2] - motion_vec.shape[1], H, C).type_as(motion_vec) + motion_vec = torch.cat([motion_vec, pad], dim=1) + else: + motion_vec = motion_vec[:, :x.shape[2]] + return x, motion_vec + + def forward_orig( + self, + x, + t, + context, + clip_fea=None, + pose_latents=None, + face_pixel_values=None, + freqs=None, + transformer_options={}, + **kwargs, + ): + # embeddings + x = self.patch_embedding(x.float()).to(x.dtype) + x, motion_vec = self.after_patch_embedding(x, pose_latents, face_pixel_values) + grid_sizes = x.shape[2:] + x = x.flatten(2).transpose(1, 2) + + # time embeddings + e = self.time_embedding( + sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype)) + e = e.reshape(t.shape[0], -1, e.shape[-1]) + e0 = self.time_projection(e).unflatten(2, (6, self.dim)) + + full_ref = None + if self.ref_conv is not None: + full_ref = kwargs.get("reference_latent", None) + if full_ref is not None: + full_ref = self.ref_conv(full_ref).flatten(2).transpose(1, 2) + x = torch.concat((full_ref, x), dim=1) + + # context + context = self.text_embedding(context) + + context_img_len = None + if clip_fea is not None: + if self.img_emb is not None: + context_clip = self.img_emb(clip_fea) # bs x 257 x dim + context = torch.concat([context_clip, context], dim=1) + context_img_len = clip_fea.shape[-2] + + patches_replace = transformer_options.get("patches_replace", {}) + blocks_replace = patches_replace.get("dit", {}) + for i, block in enumerate(self.blocks): + if ("double_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"]) + return out + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap}) + x = out["img"] + else: + x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options) + + if i % 5 == 0 and motion_vec is not None: + x = x + self.face_adapter.fuser_blocks[i // 5](x, motion_vec) + + # head + x = self.head(x, e) + + if full_ref is not None: + x = x[:, full_ref.shape[1]:] + + # unpatchify + x = self.unpatchify(x, grid_sizes) + return x diff --git a/comfy/model_base.py b/comfy/model_base.py index 70b67b7c1..b0b9cde7d 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -39,6 +39,7 @@ import comfy.ldm.cosmos.model import comfy.ldm.cosmos.predict2 import comfy.ldm.lumina.model import comfy.ldm.wan.model +import comfy.ldm.wan.model_animate import comfy.ldm.hunyuan3d.model import comfy.ldm.hidream.model import comfy.ldm.chroma.model @@ -1253,6 +1254,23 @@ class WAN21_HuMo(WAN21): return out +class WAN22_Animate(WAN21): + def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): + super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model_animate.AnimateWanModel) + self.image_to_video = image_to_video + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + + face_video_pixels = kwargs.get("face_video_pixels", None) + if face_video_pixels is not None: + out['face_pixel_values'] = comfy.conds.CONDRegular(face_video_pixels) + + pose_latents = kwargs.get("pose_video_latent", None) + if pose_latents is not None: + out['pose_latents'] = comfy.conds.CONDRegular(self.process_latent_in(pose_latents)) + return out + class WAN22_S2V(WAN21): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel_S2V) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 72621bed6..46415c17a 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -404,6 +404,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["model_type"] = "s2v" elif '{}audio_proj.audio_proj_glob_1.layer.bias'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "humo" + elif '{}face_adapter.fuser_blocks.0.k_norm.weight'.format(key_prefix) in state_dict_keys: + dit_config["model_type"] = "animate" else: if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys: dit_config["model_type"] = "i2v" diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 213b5b92c..1fbb6aef4 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1096,6 +1096,19 @@ class WAN22_S2V(WAN21_T2V): out = model_base.WAN22_S2V(self, device=device) return out +class WAN22_Animate(WAN21_T2V): + unet_config = { + "image_model": "wan2.1", + "model_type": "animate", + } + + def __init__(self, unet_config): + super().__init__(unet_config) + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.WAN22_Animate(self, device=device) + return out + class WAN22_T2V(WAN21_T2V): unet_config = { "image_model": "wan2.1", @@ -1361,6 +1374,6 @@ class HunyuanImage21Refiner(HunyuanVideo): out = model_base.HunyuanImage21Refiner(self, device=device) return out -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 5f10edcff..4187a5619 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1108,6 +1108,89 @@ class WanHuMoImageToVideo(io.ComfyNode): out_latent["samples"] = latent return io.NodeOutput(positive, negative, out_latent) +class WanAnimateToVideo(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanAnimateToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=77, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + io.Image.Input("reference_image", optional=True), + io.Image.Input("face_video", optional=True), + io.Image.Input("pose_video", optional=True), + io.Int.Input("continue_motion_max_frames", default=5, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Image.Input("continue_motion", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + io.Int.Output(display_name="trim_latent"), + ], + is_experimental=True, + ) + + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, continue_motion_max_frames, reference_image=None, clip_vision_output=None, face_video=None, pose_video=None, continue_motion=None) -> io.NodeOutput: + latent_length = ((length - 1) // 4) + 1 + latent_width = width // 8 + latent_height = height // 8 + trim_latent = 0 + + if reference_image is None: + reference_image = torch.zeros((1, height, width, 3)) + + image = comfy.utils.common_upscale(reference_image[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) + concat_latent_image = vae.encode(image[:, :, :, :3]) + mask = torch.zeros((1, 1, concat_latent_image.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=concat_latent_image.device, dtype=concat_latent_image.dtype) + trim_latent += concat_latent_image.shape[2] + + if clip_vision_output is not None: + positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output}) + negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output}) + + if face_video is not None: + face_video = comfy.utils.common_upscale(face_video[:length].movedim(-1, 1), 512, 512, "area", "center") * 2.0 - 1.0 + face_video = face_video.movedim(0, 1).unsqueeze(0) + positive = node_helpers.conditioning_set_values(positive, {"face_video_pixels": face_video}) + negative = node_helpers.conditioning_set_values(negative, {"face_video_pixels": face_video * 0.0 - 1.0}) + + if pose_video is not None: + pose_video = comfy.utils.common_upscale(pose_video[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) + pose_video_latent = vae.encode(pose_video[:, :, :, :3]) + positive = node_helpers.conditioning_set_values(positive, {"pose_video_latent": pose_video_latent}) + negative = node_helpers.conditioning_set_values(negative, {"pose_video_latent": pose_video_latent}) + + if continue_motion is None: + image = torch.ones((length, height, width, 3)) * 0.5 + else: + continue_motion = continue_motion[-continue_motion_max_frames:] + continue_motion = comfy.utils.common_upscale(continue_motion[-length:].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) + image = torch.ones((length, height, width, continue_motion.shape[-1]), device=continue_motion.device, dtype=continue_motion.dtype) * 0.5 + image[:continue_motion.shape[0]] = continue_motion + + concat_latent_image = torch.cat((concat_latent_image, vae.encode(image[:, :, :, :3])), dim=2) + mask_refmotion = torch.ones((1, 1, latent_length, concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=mask.device, dtype=mask.dtype) + if continue_motion is not None: + mask_refmotion[:, :, :((continue_motion.shape[0] - 1) // 4) + 1] = 0.0 + + mask = torch.cat((mask, mask_refmotion), dim=2) + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) + + latent = torch.zeros([batch_size, 16, latent_length + trim_latent, latent_height, latent_width], device=comfy.model_management.intermediate_device()) + out_latent = {} + out_latent["samples"] = latent + return io.NodeOutput(positive, negative, out_latent, trim_latent) + class Wan22ImageToVideoLatent(io.ComfyNode): @classmethod def define_schema(cls): @@ -1169,6 +1252,7 @@ class WanExtension(ComfyExtension): WanSoundImageToVideo, WanSoundImageToVideoExtend, WanHuMoImageToVideo, + WanAnimateToVideo, Wan22ImageToVideoLatent, ] From 9fdf8c25abb2133803063a9be395cac774fce611 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 19 Sep 2025 23:02:43 +0300 Subject: [PATCH 301/759] api_nodes: reduce default timeout from 7 days to 2 hours (#9918) --- comfy_api_nodes/apis/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py index 4ad0b783b..0aed906fb 100644 --- a/comfy_api_nodes/apis/client.py +++ b/comfy_api_nodes/apis/client.py @@ -683,7 +683,7 @@ class SynchronousOperation(Generic[T, R]): auth_token: Optional[str] = None, comfy_api_key: Optional[str] = None, auth_kwargs: Optional[Dict[str, str]] = None, - timeout: float = 604800.0, + timeout: float = 7200.0, verify_ssl: bool = True, content_type: str = "application/json", multipart_parser: Callable | None = None, From 852704c81a652cc53fbe53c5f47dea0e50d0534e Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 19 Sep 2025 23:04:51 +0300 Subject: [PATCH 302/759] fix(seedream4): add flag to ignore error on partial success (#9952) --- comfy_api_nodes/nodes_bytedance.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index 369a3a4fe..a7eeaf15a 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -567,6 +567,12 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): tooltip="Whether to add an \"AI generated\" watermark to the image.", optional=True, ), + comfy_io.Boolean.Input( + "fail_on_partial", + default=True, + tooltip="If enabled, abort execution if any requested images are missing or return an error.", + optional=True, + ), ], outputs=[ comfy_io.Image.Output(), @@ -592,6 +598,7 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): max_images: int = 1, seed: int = 0, watermark: bool = True, + fail_on_partial: bool = True, ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) w = h = None @@ -651,9 +658,10 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): if len(response.data) == 1: return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) - return comfy_io.NodeOutput( - torch.cat([await download_url_to_image_tensor(str(i["url"])) for i in response.data]) - ) + urls = [str(d["url"]) for d in response.data if isinstance(d, dict) and "url" in d] + if fail_on_partial and len(urls) < len(response.data): + raise RuntimeError(f"Only {len(urls)} of {len(response.data)} images were generated before error.") + return comfy_io.NodeOutput(torch.cat([await download_url_to_image_tensor(i) for i in urls])) class ByteDanceTextToVideoNode(comfy_io.ComfyNode): @@ -1171,7 +1179,7 @@ async def process_video_task( payload: Union[Text2VideoTaskCreationRequest, Image2VideoTaskCreationRequest], auth_kwargs: dict, node_id: str, - estimated_duration: int | None, + estimated_duration: Optional[int], ) -> comfy_io.NodeOutput: initial_response = await SynchronousOperation( endpoint=ApiEndpoint( From e8df53b764c7dfce1a9235f6ee70a17cfdece3ff Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 19 Sep 2025 15:48:56 -0700 Subject: [PATCH 303/759] Update WanAnimateToVideo to more easily extend videos. (#9959) --- comfy/ldm/wan/model_animate.py | 2 +- comfy_extras/nodes_wan.py | 63 +++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/comfy/ldm/wan/model_animate.py b/comfy/ldm/wan/model_animate.py index 542f54110..7c87835d4 100644 --- a/comfy/ldm/wan/model_animate.py +++ b/comfy/ldm/wan/model_animate.py @@ -451,7 +451,7 @@ class AnimateWanModel(WanModel): def after_patch_embedding(self, x, pose_latents, face_pixel_values): if pose_latents is not None: pose_latents = self.pose_patch_embedding(pose_latents) - x[:, :, 1:] += pose_latents + x[:, :, 1:pose_latents.shape[2] + 1] += pose_latents[:, :, :x.shape[2] - 1] if face_pixel_values is None: return x, None diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 4187a5619..3e5fef535 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1128,18 +1128,22 @@ class WanAnimateToVideo(io.ComfyNode): io.Image.Input("pose_video", optional=True), io.Int.Input("continue_motion_max_frames", default=5, min=1, max=nodes.MAX_RESOLUTION, step=4), io.Image.Input("continue_motion", optional=True), + io.Int.Input("video_frame_offset", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1, tooltip="The amount of frames to seek in all the input videos. Used for generating longer videos by chunk. Connect to the video_frame_offset output of the previous node for extending a video."), ], outputs=[ io.Conditioning.Output(display_name="positive"), io.Conditioning.Output(display_name="negative"), io.Latent.Output(display_name="latent"), io.Int.Output(display_name="trim_latent"), + io.Int.Output(display_name="trim_image"), + io.Int.Output(display_name="video_frame_offset"), ], is_experimental=True, ) @classmethod - def execute(cls, positive, negative, vae, width, height, length, batch_size, continue_motion_max_frames, reference_image=None, clip_vision_output=None, face_video=None, pose_video=None, continue_motion=None) -> io.NodeOutput: + def execute(cls, positive, negative, vae, width, height, length, batch_size, continue_motion_max_frames, video_frame_offset, reference_image=None, clip_vision_output=None, face_video=None, pose_video=None, continue_motion=None) -> io.NodeOutput: + trim_to_pose_video = False latent_length = ((length - 1) // 4) + 1 latent_width = width // 8 latent_height = height // 8 @@ -1152,35 +1156,60 @@ class WanAnimateToVideo(io.ComfyNode): concat_latent_image = vae.encode(image[:, :, :, :3]) mask = torch.zeros((1, 1, concat_latent_image.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=concat_latent_image.device, dtype=concat_latent_image.dtype) trim_latent += concat_latent_image.shape[2] + ref_motion_latent_length = 0 + + if continue_motion is None: + image = torch.ones((length, height, width, 3)) * 0.5 + else: + continue_motion = continue_motion[-continue_motion_max_frames:] + video_frame_offset -= continue_motion.shape[0] + video_frame_offset = max(0, video_frame_offset) + continue_motion = comfy.utils.common_upscale(continue_motion[-length:].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) + image = torch.ones((length, height, width, continue_motion.shape[-1]), device=continue_motion.device, dtype=continue_motion.dtype) * 0.5 + image[:continue_motion.shape[0]] = continue_motion + ref_motion_latent_length += ((continue_motion.shape[0] - 1) // 4) + 1 if clip_vision_output is not None: positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output}) negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output}) + if pose_video is not None: + if pose_video.shape[0] <= video_frame_offset: + pose_video = None + else: + pose_video = pose_video[video_frame_offset:] + + if pose_video is not None: + pose_video = comfy.utils.common_upscale(pose_video[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) + if not trim_to_pose_video: + if pose_video.shape[0] < length: + pose_video = torch.cat((pose_video,) + (pose_video[-1:],) * (length - pose_video.shape[0]), dim=0) + + pose_video_latent = vae.encode(pose_video[:, :, :, :3]) + positive = node_helpers.conditioning_set_values(positive, {"pose_video_latent": pose_video_latent}) + negative = node_helpers.conditioning_set_values(negative, {"pose_video_latent": pose_video_latent}) + + if trim_to_pose_video: + latent_length = pose_video_latent.shape[2] + length = latent_length * 4 - 3 + image = image[:length] + + if face_video is not None: + if face_video.shape[0] <= video_frame_offset: + face_video = None + else: + face_video = face_video[video_frame_offset:] + if face_video is not None: face_video = comfy.utils.common_upscale(face_video[:length].movedim(-1, 1), 512, 512, "area", "center") * 2.0 - 1.0 face_video = face_video.movedim(0, 1).unsqueeze(0) positive = node_helpers.conditioning_set_values(positive, {"face_video_pixels": face_video}) negative = node_helpers.conditioning_set_values(negative, {"face_video_pixels": face_video * 0.0 - 1.0}) - if pose_video is not None: - pose_video = comfy.utils.common_upscale(pose_video[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) - pose_video_latent = vae.encode(pose_video[:, :, :, :3]) - positive = node_helpers.conditioning_set_values(positive, {"pose_video_latent": pose_video_latent}) - negative = node_helpers.conditioning_set_values(negative, {"pose_video_latent": pose_video_latent}) - - if continue_motion is None: - image = torch.ones((length, height, width, 3)) * 0.5 - else: - continue_motion = continue_motion[-continue_motion_max_frames:] - continue_motion = comfy.utils.common_upscale(continue_motion[-length:].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) - image = torch.ones((length, height, width, continue_motion.shape[-1]), device=continue_motion.device, dtype=continue_motion.dtype) * 0.5 - image[:continue_motion.shape[0]] = continue_motion - concat_latent_image = torch.cat((concat_latent_image, vae.encode(image[:, :, :, :3])), dim=2) mask_refmotion = torch.ones((1, 1, latent_length, concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=mask.device, dtype=mask.dtype) if continue_motion is not None: - mask_refmotion[:, :, :((continue_motion.shape[0] - 1) // 4) + 1] = 0.0 + mask_refmotion[:, :, :ref_motion_latent_length] = 0.0 mask = torch.cat((mask, mask_refmotion), dim=2) positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) @@ -1189,7 +1218,7 @@ class WanAnimateToVideo(io.ComfyNode): latent = torch.zeros([batch_size, 16, latent_length + trim_latent, latent_height, latent_width], device=comfy.model_management.intermediate_device()) out_latent = {} out_latent["samples"] = latent - return io.NodeOutput(positive, negative, out_latent, trim_latent) + return io.NodeOutput(positive, negative, out_latent, trim_latent, max(0, ref_motion_latent_length * 4 - 3), video_frame_offset + length) class Wan22ImageToVideoLatent(io.ComfyNode): @classmethod From 66241cef31f21247ec8b450d699250fd83b3ff7c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 19 Sep 2025 23:24:10 -0700 Subject: [PATCH 304/759] Add inputs for character replacement to the WanAnimateToVideo node. (#9960) --- comfy_extras/nodes_wan.py | 40 +++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 3e5fef535..9cca6fb2e 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1127,6 +1127,8 @@ class WanAnimateToVideo(io.ComfyNode): io.Image.Input("face_video", optional=True), io.Image.Input("pose_video", optional=True), io.Int.Input("continue_motion_max_frames", default=5, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Image.Input("background_video", optional=True), + io.Mask.Input("character_mask", optional=True), io.Image.Input("continue_motion", optional=True), io.Int.Input("video_frame_offset", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1, tooltip="The amount of frames to seek in all the input videos. Used for generating longer videos by chunk. Connect to the video_frame_offset output of the previous node for extending a video."), ], @@ -1142,7 +1144,7 @@ class WanAnimateToVideo(io.ComfyNode): ) @classmethod - def execute(cls, positive, negative, vae, width, height, length, batch_size, continue_motion_max_frames, video_frame_offset, reference_image=None, clip_vision_output=None, face_video=None, pose_video=None, continue_motion=None) -> io.NodeOutput: + def execute(cls, positive, negative, vae, width, height, length, batch_size, continue_motion_max_frames, video_frame_offset, reference_image=None, clip_vision_output=None, face_video=None, pose_video=None, continue_motion=None, background_video=None, character_mask=None) -> io.NodeOutput: trim_to_pose_video = False latent_length = ((length - 1) // 4) + 1 latent_width = width // 8 @@ -1154,7 +1156,7 @@ class WanAnimateToVideo(io.ComfyNode): image = comfy.utils.common_upscale(reference_image[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) concat_latent_image = vae.encode(image[:, :, :, :3]) - mask = torch.zeros((1, 1, concat_latent_image.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=concat_latent_image.device, dtype=concat_latent_image.dtype) + mask = torch.zeros((1, 4, concat_latent_image.shape[-3], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=concat_latent_image.device, dtype=concat_latent_image.dtype) trim_latent += concat_latent_image.shape[2] ref_motion_latent_length = 0 @@ -1206,11 +1208,37 @@ class WanAnimateToVideo(io.ComfyNode): positive = node_helpers.conditioning_set_values(positive, {"face_video_pixels": face_video}) negative = node_helpers.conditioning_set_values(negative, {"face_video_pixels": face_video * 0.0 - 1.0}) - concat_latent_image = torch.cat((concat_latent_image, vae.encode(image[:, :, :, :3])), dim=2) - mask_refmotion = torch.ones((1, 1, latent_length, concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=mask.device, dtype=mask.dtype) - if continue_motion is not None: - mask_refmotion[:, :, :ref_motion_latent_length] = 0.0 + ref_images_num = max(0, ref_motion_latent_length * 4 - 3) + if background_video is not None: + if background_video.shape[0] > video_frame_offset: + background_video = background_video[video_frame_offset:] + background_video = comfy.utils.common_upscale(background_video[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) + if background_video.shape[0] > ref_images_num: + image[ref_images_num:background_video.shape[0] - ref_images_num] = background_video[ref_images_num:] + mask_refmotion = torch.ones((1, 1, latent_length * 4, concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=mask.device, dtype=mask.dtype) + if continue_motion is not None: + mask_refmotion[:, :, :ref_motion_latent_length * 4] = 0.0 + + if character_mask is not None: + if character_mask.shape[0] > video_frame_offset or character_mask.shape[0] == 1: + if character_mask.shape[0] == 1: + character_mask = character_mask.repeat((length,) + (1,) * (character_mask.ndim - 1)) + else: + character_mask = character_mask[video_frame_offset:] + if character_mask.ndim == 3: + character_mask = character_mask.unsqueeze(1) + character_mask = character_mask.movedim(0, 1) + if character_mask.ndim == 4: + character_mask = character_mask.unsqueeze(1) + character_mask = comfy.utils.common_upscale(character_mask[:, :, :length], concat_latent_image.shape[-1], concat_latent_image.shape[-2], "nearest-exact", "center") + if character_mask.shape[2] > ref_images_num: + mask_refmotion[:, :, ref_images_num:character_mask.shape[2] + ref_images_num] = character_mask[:, :, ref_images_num:] + + concat_latent_image = torch.cat((concat_latent_image, vae.encode(image[:, :, :, :3])), dim=2) + + + mask_refmotion = mask_refmotion.view(1, mask_refmotion.shape[2] // 4, 4, mask_refmotion.shape[3], mask_refmotion.shape[4]).transpose(1, 2) mask = torch.cat((mask, mask_refmotion), dim=2) positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) From 9ed3c5cc09c55d2fffa67b59d9d21e3b44d7653e Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Sat, 20 Sep 2025 18:10:39 -0700 Subject: [PATCH 305/759] [Reviving #5709] Add strength input to Differential Diffusion (#9957) * Update nodes_differential_diffusion.py * Update nodes_differential_diffusion.py * Make strength optional to avoid validation errors when loading old workflows, adjust step --------- Co-authored-by: ThereforeGames --- comfy_extras/nodes_differential_diffusion.py | 33 +++++++++++++++----- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/comfy_extras/nodes_differential_diffusion.py b/comfy_extras/nodes_differential_diffusion.py index 98dbbf102..255ac420d 100644 --- a/comfy_extras/nodes_differential_diffusion.py +++ b/comfy_extras/nodes_differential_diffusion.py @@ -5,19 +5,30 @@ import torch class DifferentialDiffusion(): @classmethod def INPUT_TYPES(s): - return {"required": {"model": ("MODEL", ), - }} + return { + "required": { + "model": ("MODEL", ), + }, + "optional": { + "strength": ("FLOAT", { + "default": 1.0, + "min": 0.0, + "max": 1.0, + "step": 0.01, + }), + } + } RETURN_TYPES = ("MODEL",) FUNCTION = "apply" CATEGORY = "_for_testing" INIT = False - def apply(self, model): + def apply(self, model, strength=1.0): model = model.clone() - model.set_model_denoise_mask_function(self.forward) - return (model,) + model.set_model_denoise_mask_function(lambda *args, **kwargs: self.forward(*args, **kwargs, strength=strength)) + return (model, ) - def forward(self, sigma: torch.Tensor, denoise_mask: torch.Tensor, extra_options: dict): + def forward(self, sigma: torch.Tensor, denoise_mask: torch.Tensor, extra_options: dict, strength: float): model = extra_options["model"] step_sigmas = extra_options["sigmas"] sigma_to = model.inner_model.model_sampling.sigma_min @@ -31,7 +42,15 @@ class DifferentialDiffusion(): threshold = (current_ts - ts_to) / (ts_from - ts_to) - return (denoise_mask >= threshold).to(denoise_mask.dtype) + # Generate the binary mask based on the threshold + binary_mask = (denoise_mask >= threshold).to(denoise_mask.dtype) + + # Blend binary mask with the original denoise_mask using strength + if strength and strength < 1: + blended_mask = strength * binary_mask + (1 - strength) * denoise_mask + return blended_mask + else: + return binary_mask NODE_CLASS_MAPPINGS = { From 7be2b49b6b3430783555bc6bc8fcb3f46d5392e7 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Sun, 21 Sep 2025 09:24:48 +0800 Subject: [PATCH 306/759] Fix LoRA Trainer bugs with FP8 models. (#9854) * Fix adapter weight init * Fix fp8 model training * Avoid inference tensor --- comfy/ops.py | 13 +++++++------ comfy/weight_adapter/loha.py | 8 ++++---- comfy/weight_adapter/lokr.py | 4 ++-- comfy/weight_adapter/lora.py | 4 ++-- comfy/weight_adapter/oft.py | 2 +- comfy_extras/nodes_train.py | 18 ++++++++++++++++++ 6 files changed, 34 insertions(+), 15 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 55e958adb..9d7dedd37 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -365,12 +365,13 @@ class fp8_ops(manual_cast): return None def forward_comfy_cast_weights(self, input): - try: - out = fp8_linear(self, input) - if out is not None: - return out - except Exception as e: - logging.info("Exception during fp8 op: {}".format(e)) + if not self.training: + try: + out = fp8_linear(self, input) + if out is not None: + return out + except Exception as e: + logging.info("Exception during fp8 op: {}".format(e)) weight, bias = cast_bias_weight(self, input) return torch.nn.functional.linear(input, weight, bias) diff --git a/comfy/weight_adapter/loha.py b/comfy/weight_adapter/loha.py index 55c97a3af..0abb2d403 100644 --- a/comfy/weight_adapter/loha.py +++ b/comfy/weight_adapter/loha.py @@ -130,12 +130,12 @@ class LoHaAdapter(WeightAdapterBase): def create_train(cls, weight, rank=1, alpha=1.0): out_dim = weight.shape[0] in_dim = weight.shape[1:].numel() - mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype) - mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype) + mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=torch.float32) + mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=torch.float32) torch.nn.init.normal_(mat1, 0.1) torch.nn.init.constant_(mat2, 0.0) - mat3 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype) - mat4 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype) + mat3 = torch.empty(out_dim, rank, device=weight.device, dtype=torch.float32) + mat4 = torch.empty(rank, in_dim, device=weight.device, dtype=torch.float32) torch.nn.init.normal_(mat3, 0.1) torch.nn.init.normal_(mat4, 0.01) return LohaDiff( diff --git a/comfy/weight_adapter/lokr.py b/comfy/weight_adapter/lokr.py index 563c835f5..9b2aff2d7 100644 --- a/comfy/weight_adapter/lokr.py +++ b/comfy/weight_adapter/lokr.py @@ -89,8 +89,8 @@ class LoKrAdapter(WeightAdapterBase): in_dim = weight.shape[1:].numel() out1, out2 = factorization(out_dim, rank) in1, in2 = factorization(in_dim, rank) - mat1 = torch.empty(out1, in1, device=weight.device, dtype=weight.dtype) - mat2 = torch.empty(out2, in2, device=weight.device, dtype=weight.dtype) + mat1 = torch.empty(out1, in1, device=weight.device, dtype=torch.float32) + mat2 = torch.empty(out2, in2, device=weight.device, dtype=torch.float32) torch.nn.init.kaiming_uniform_(mat2, a=5**0.5) torch.nn.init.constant_(mat1, 0.0) return LokrDiff( diff --git a/comfy/weight_adapter/lora.py b/comfy/weight_adapter/lora.py index 47aa17d13..4db004e50 100644 --- a/comfy/weight_adapter/lora.py +++ b/comfy/weight_adapter/lora.py @@ -66,8 +66,8 @@ class LoRAAdapter(WeightAdapterBase): def create_train(cls, weight, rank=1, alpha=1.0): out_dim = weight.shape[0] in_dim = weight.shape[1:].numel() - mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=weight.dtype) - mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=weight.dtype) + mat1 = torch.empty(out_dim, rank, device=weight.device, dtype=torch.float32) + mat2 = torch.empty(rank, in_dim, device=weight.device, dtype=torch.float32) torch.nn.init.kaiming_uniform_(mat1, a=5**0.5) torch.nn.init.constant_(mat2, 0.0) return LoraDiff( diff --git a/comfy/weight_adapter/oft.py b/comfy/weight_adapter/oft.py index 9d4982083..c0aab9635 100644 --- a/comfy/weight_adapter/oft.py +++ b/comfy/weight_adapter/oft.py @@ -68,7 +68,7 @@ class OFTAdapter(WeightAdapterBase): def create_train(cls, weight, rank=1, alpha=1.0): out_dim = weight.shape[0] block_size, block_num = factorization(out_dim, rank) - block = torch.zeros(block_num, block_size, block_size, device=weight.device, dtype=weight.dtype) + block = torch.zeros(block_num, block_size, block_size, device=weight.device, dtype=torch.float32) return OFTDiff( (block, None, alpha, None) ) diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index c3aaaee9b..9e6ec6780 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -38,6 +38,23 @@ def make_batch_extra_option_dict(d, indicies, full_size=None): return new_dict +def process_cond_list(d, prefix=""): + if hasattr(d, "__iter__") and not hasattr(d, "items"): + for index, item in enumerate(d): + process_cond_list(item, f"{prefix}.{index}") + return d + elif hasattr(d, "items"): + for k, v in list(d.items()): + if isinstance(v, dict): + process_cond_list(v, f"{prefix}.{k}") + elif isinstance(v, torch.Tensor): + d[k] = v.clone() + elif isinstance(v, (list, tuple)): + for index, item in enumerate(v): + process_cond_list(item, f"{prefix}.{k}.{index}") + return d + + class TrainSampler(comfy.samplers.Sampler): def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, grad_acc=1, total_steps=1, seed=0, training_dtype=torch.bfloat16): self.loss_fn = loss_fn @@ -50,6 +67,7 @@ class TrainSampler(comfy.samplers.Sampler): self.training_dtype = training_dtype def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False): + model_wrap.conds = process_cond_list(model_wrap.conds) cond = model_wrap.conds["positive"] dataset_size = sigmas.size(0) torch.cuda.empty_cache() From d1d9eb94b1096c9b3f963bf152bd6b9cd330c3a4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 20 Sep 2025 19:09:35 -0700 Subject: [PATCH 307/759] Lower wan memory estimation value a bit. (#9964) Previous pr reduced the peak memory requirement. --- comfy/supported_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 1fbb6aef4..4064bdae1 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -995,7 +995,7 @@ class WAN21_T2V(supported_models_base.BASE): unet_extra_config = {} latent_format = latent_formats.Wan21 - memory_usage_factor = 1.0 + memory_usage_factor = 0.9 supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32] @@ -1004,7 +1004,7 @@ class WAN21_T2V(supported_models_base.BASE): def __init__(self, unet_config): super().__init__(unet_config) - self.memory_usage_factor = self.unet_config.get("dim", 2000) / 2000 + self.memory_usage_factor = self.unet_config.get("dim", 2000) / 2222 def get_model(self, state_dict, prefix="", device=None): out = model_base.WAN21(self, device=device) From 27bc181c49249f11da2d8a14f84f3bdb58a0615f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 21 Sep 2025 16:48:31 -0700 Subject: [PATCH 308/759] Set some wan nodes as no longer experimental. (#9976) --- comfy_extras/nodes_wan.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 9cca6fb2e..b1e9babb5 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -287,7 +287,6 @@ class WanVaceToVideo(io.ComfyNode): return io.Schema( node_id="WanVaceToVideo", category="conditioning/video_models", - is_experimental=True, inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -375,7 +374,6 @@ class TrimVideoLatent(io.ComfyNode): return io.Schema( node_id="TrimVideoLatent", category="latent/video", - is_experimental=True, inputs=[ io.Latent.Input("samples"), io.Int.Input("trim_amount", default=0, min=0, max=99999), @@ -969,7 +967,6 @@ class WanSoundImageToVideo(io.ComfyNode): io.Conditioning.Output(display_name="negative"), io.Latent.Output(display_name="latent"), ], - is_experimental=True, ) @classmethod @@ -1000,7 +997,6 @@ class WanSoundImageToVideoExtend(io.ComfyNode): io.Conditioning.Output(display_name="negative"), io.Latent.Output(display_name="latent"), ], - is_experimental=True, ) @classmethod From 1fee8827cb8160c85d96c375413ac590311525dc Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 22 Sep 2025 13:49:48 -0700 Subject: [PATCH 309/759] Support for qwen edit plus model. Use the new TextEncodeQwenImageEditPlus. (#9986) --- comfy/text_encoders/llama.py | 16 +++++++---- comfy_extras/nodes_qwen.py | 55 ++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 5e11956b5..c5a48ba9f 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -400,21 +400,25 @@ class Qwen25_7BVLI(BaseLlama, torch.nn.Module): def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]): grid = None + position_ids = None + offset = 0 for e in embeds_info: if e.get("type") == "image": grid = e.get("extra", None) - position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device) start = e.get("index") - position_ids[:, :start] = torch.arange(0, start, device=embeds.device) + if position_ids is None: + position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device) + position_ids[:, :start] = torch.arange(0, start, device=embeds.device) end = e.get("size") + start len_max = int(grid.max()) // 2 start_next = len_max + start - position_ids[:, end:] = torch.arange(start_next, start_next + (embeds.shape[1] - end), device=embeds.device) - position_ids[0, start:end] = start + position_ids[:, end:] = torch.arange(start_next + offset, start_next + (embeds.shape[1] - end) + offset, device=embeds.device) + position_ids[0, start:end] = start + offset max_d = int(grid[0][1]) // 2 - position_ids[1, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start] + position_ids[1, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start] max_d = int(grid[0][2]) // 2 - position_ids[2, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start] + position_ids[2, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start] + offset += len_max - (end - start) if grid is None: position_ids = None diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py index fff89556f..49747dc7a 100644 --- a/comfy_extras/nodes_qwen.py +++ b/comfy_extras/nodes_qwen.py @@ -43,6 +43,61 @@ class TextEncodeQwenImageEdit: return (conditioning, ) +class TextEncodeQwenImageEditPlus: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "clip": ("CLIP", ), + "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), + }, + "optional": {"vae": ("VAE", ), + "image1": ("IMAGE", ), + "image2": ("IMAGE", ), + "image3": ("IMAGE", ), + }} + + RETURN_TYPES = ("CONDITIONING",) + FUNCTION = "encode" + + CATEGORY = "advanced/conditioning" + + def encode(self, clip, prompt, vae=None, image1=None, image2=None, image3=None): + ref_latents = [] + images = [image1, image2, image3] + images_vl = [] + llama_template = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + image_prompt = "" + + for i, image in enumerate(images): + if image is not None: + samples = image.movedim(-1, 1) + total = int(384 * 384) + + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + width = round(samples.shape[3] * scale_by) + height = round(samples.shape[2] * scale_by) + + s = comfy.utils.common_upscale(samples, width, height, "area", "disabled") + images_vl.append(s.movedim(1, -1)) + if vae is not None: + total = int(1024 * 1024) + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + width = round(samples.shape[3] * scale_by / 8.0) * 8 + height = round(samples.shape[2] * scale_by / 8.0) * 8 + + s = comfy.utils.common_upscale(samples, width, height, "area", "disabled") + ref_latents.append(vae.encode(s.movedim(1, -1)[:, :, :, :3])) + + image_prompt += "Picture {}: <|vision_start|><|image_pad|><|vision_end|>".format(i + 1) + + tokens = clip.tokenize(image_prompt + prompt, images=images_vl, llama_template=llama_template) + conditioning = clip.encode_from_tokens_scheduled(tokens) + if len(ref_latents) > 0: + conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": ref_latents}, append=True) + return (conditioning, ) + + NODE_CLASS_MAPPINGS = { "TextEncodeQwenImageEdit": TextEncodeQwenImageEdit, + "TextEncodeQwenImageEditPlus": TextEncodeQwenImageEditPlus, } From e3206351b07852f2127a56abd898ee77f7f4c25f Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Mon, 22 Sep 2025 14:12:32 -0700 Subject: [PATCH 310/759] add offset param (#9977) --- server.py | 9 ++- tests/execution/test_execution.py | 105 +++++++++++++++++++++++++++++- 2 files changed, 112 insertions(+), 2 deletions(-) diff --git a/server.py b/server.py index 43816a8cd..603677397 100644 --- a/server.py +++ b/server.py @@ -645,7 +645,14 @@ class PromptServer(): max_items = request.rel_url.query.get("max_items", None) if max_items is not None: max_items = int(max_items) - return web.json_response(self.prompt_queue.get_history(max_items=max_items)) + + offset = request.rel_url.query.get("offset", None) + if offset is not None: + offset = int(offset) + else: + offset = -1 + + return web.json_response(self.prompt_queue.get_history(max_items=max_items, offset=offset)) @routes.get("/history/{prompt_id}") async def get_history_prompt_id(request): diff --git a/tests/execution/test_execution.py b/tests/execution/test_execution.py index 8ea05fdd8..ef73ad9fd 100644 --- a/tests/execution/test_execution.py +++ b/tests/execution/test_execution.py @@ -84,6 +84,21 @@ class ComfyClient: with urllib.request.urlopen("http://{}/history/{}".format(self.server_address, prompt_id)) as response: return json.loads(response.read()) + def get_all_history(self, max_items=None, offset=None): + url = "http://{}/history".format(self.server_address) + params = {} + if max_items is not None: + params["max_items"] = max_items + if offset is not None: + params["offset"] = offset + + if params: + url_values = urllib.parse.urlencode(params) + url = "{}?{}".format(url, url_values) + + with urllib.request.urlopen(url) as response: + return json.loads(response.read()) + def set_test_name(self, name): self.test_name = name @@ -498,7 +513,6 @@ class TestExecution: assert len(images1) == 1, "Should have 1 image" assert len(images2) == 1, "Should have 1 image" - # This tests that only constant outputs are used in the call to `IS_CHANGED` def test_is_changed_with_outputs(self, client: ComfyClient, builder: GraphBuilder): g = builder @@ -762,3 +776,92 @@ class TestExecution: except urllib.error.HTTPError: pass # Expected behavior + def _create_history_item(self, client, builder): + g = GraphBuilder(prefix="offset_test") + input_node = g.node( + "StubImage", content="BLACK", height=32, width=32, batch_size=1 + ) + g.node("SaveImage", images=input_node.out(0)) + return client.run(g) + + def test_offset_returns_different_items_than_beginning_of_history( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test that offset skips items at the beginning""" + for _ in range(5): + self._create_history_item(client, builder) + + first_two = client.get_all_history(max_items=2, offset=0) + next_two = client.get_all_history(max_items=2, offset=2) + + assert set(first_two.keys()).isdisjoint( + set(next_two.keys()) + ), "Offset should skip initial items" + + def test_offset_beyond_history_length_returns_empty( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test offset larger than total history returns empty result""" + self._create_history_item(client, builder) + + result = client.get_all_history(offset=100) + assert len(result) == 0, "Large offset should return no items" + + def test_offset_at_exact_history_length_returns_empty( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test offset equal to history length returns empty""" + for _ in range(3): + self._create_history_item(client, builder) + + all_history = client.get_all_history() + result = client.get_all_history(offset=len(all_history)) + assert len(result) == 0, "Offset at history length should return empty" + + def test_offset_zero_equals_no_offset_parameter( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test offset=0 behaves same as omitting offset""" + self._create_history_item(client, builder) + + with_zero = client.get_all_history(offset=0) + without_offset = client.get_all_history() + + assert with_zero == without_offset, "offset=0 should equal no offset" + + def test_offset_without_max_items_skips_from_beginning( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test offset alone (no max_items) returns remaining items""" + for _ in range(4): + self._create_history_item(client, builder) + + all_items = client.get_all_history() + offset_items = client.get_all_history(offset=2) + + assert ( + len(offset_items) == len(all_items) - 2 + ), "Offset should skip specified number of items" + + def test_offset_with_max_items_returns_correct_window( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test offset + max_items returns correct slice of history""" + for _ in range(6): + self._create_history_item(client, builder) + + window = client.get_all_history(max_items=2, offset=1) + assert len(window) <= 2, "Should respect max_items limit" + + def test_offset_near_end_returns_remaining_items_only( + self, client: ComfyClient, builder: GraphBuilder + ): + """Test offset near end of history returns only remaining items""" + for _ in range(3): + self._create_history_item(client, builder) + + all_history = client.get_all_history() + # Offset to near the end + result = client.get_all_history(max_items=5, offset=len(all_history) - 1) + + assert len(result) <= 1, "Should return at most 1 item when offset is near end" From 8a5ac527e60fcd48ec228d309d49ab28ac79def8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 22 Sep 2025 14:26:58 -0700 Subject: [PATCH 311/759] Fix bug with WanAnimateToVideo node. (#9988) --- comfy_extras/nodes_wan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index b1e9babb5..6c16a2673 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1210,7 +1210,7 @@ class WanAnimateToVideo(io.ComfyNode): background_video = background_video[video_frame_offset:] background_video = comfy.utils.common_upscale(background_video[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1) if background_video.shape[0] > ref_images_num: - image[ref_images_num:background_video.shape[0] - ref_images_num] = background_video[ref_images_num:] + image[ref_images_num:background_video.shape[0]] = background_video[ref_images_num:] mask_refmotion = torch.ones((1, 1, latent_length * 4, concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=mask.device, dtype=mask.dtype) if continue_motion is not None: From 707b2638ecd82360c0a67e1d86cc4fdeae218d03 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 22 Sep 2025 14:34:33 -0700 Subject: [PATCH 312/759] Fix bug with WanAnimateToVideo. (#9990) --- comfy_extras/nodes_wan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index 6c16a2673..b0bd471bf 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -1229,7 +1229,7 @@ class WanAnimateToVideo(io.ComfyNode): character_mask = character_mask.unsqueeze(1) character_mask = comfy.utils.common_upscale(character_mask[:, :, :length], concat_latent_image.shape[-1], concat_latent_image.shape[-2], "nearest-exact", "center") if character_mask.shape[2] > ref_images_num: - mask_refmotion[:, :, ref_images_num:character_mask.shape[2] + ref_images_num] = character_mask[:, :, ref_images_num:] + mask_refmotion[:, :, ref_images_num:character_mask.shape[2]] = character_mask[:, :, ref_images_num:] concat_latent_image = torch.cat((concat_latent_image, vae.encode(image[:, :, :, :3])), dim=2) From 145b0e4f79b5d9e815bb781ba29ccd057bb52dab Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 23 Sep 2025 23:22:35 +0800 Subject: [PATCH 313/759] update template to 0.1.86 (#9998) * update template to 0.1.84 * update template to 0.1.85 * Update template to 0.1.86 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 79187efaa..2980bebdd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.26.13 -comfyui-workflow-templates==0.1.81 +comfyui-workflow-templates==0.1.86 comfyui-embedded-docs==0.2.6 torch torchsde From e8087907995497c6971ee64bd5fa02cb49c1eda6 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 23 Sep 2025 18:36:47 +0300 Subject: [PATCH 314/759] feat(api-nodes): add wan t2i, t2v, i2v nodes (#9996) --- comfy_api_nodes/nodes_wan.py | 602 +++++++++++++++++++++++++++++++++++ nodes.py | 1 + 2 files changed, 603 insertions(+) create mode 100644 comfy_api_nodes/nodes_wan.py diff --git a/comfy_api_nodes/nodes_wan.py b/comfy_api_nodes/nodes_wan.py new file mode 100644 index 000000000..db5bd41c1 --- /dev/null +++ b/comfy_api_nodes/nodes_wan.py @@ -0,0 +1,602 @@ +import re +from typing import Optional, Type, Union +from typing_extensions import override + +import torch +from pydantic import BaseModel, Field +from comfy_api.latest import ComfyExtension, Input, io as comfy_io +from comfy_api_nodes.apis.client import ( + ApiEndpoint, + HttpMethod, + SynchronousOperation, + PollingOperation, + EmptyRequest, + R, + T, +) +from comfy_api_nodes.util.validation_utils import get_number_of_images, validate_audio_duration + +from comfy_api_nodes.apinode_utils import ( + download_url_to_image_tensor, + download_url_to_video_output, + tensor_to_base64_string, + audio_to_base64_string, +) + +class Text2ImageInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: Optional[str] = Field(None) + + +class Text2VideoInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: Optional[str] = Field(None) + audio_url: Optional[str] = Field(None) + + +class Image2VideoInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: Optional[str] = Field(None) + img_url: str = Field(...) + audio_url: Optional[str] = Field(None) + + +class Txt2ImageParametersField(BaseModel): + size: str = Field(...) + n: int = Field(1, description="Number of images to generate.") # we support only value=1 + seed: int = Field(..., ge=0, le=2147483647) + prompt_extend: bool = Field(True) + watermark: bool = Field(True) + + +class Text2VideoParametersField(BaseModel): + size: str = Field(...) + seed: int = Field(..., ge=0, le=2147483647) + duration: int = Field(5, ge=5, le=10) + prompt_extend: bool = Field(True) + watermark: bool = Field(True) + audio: bool = Field(False, description="Should be audio generated automatically") + + +class Image2VideoParametersField(BaseModel): + resolution: str = Field(...) + seed: int = Field(..., ge=0, le=2147483647) + duration: int = Field(5, ge=5, le=10) + prompt_extend: bool = Field(True) + watermark: bool = Field(True) + audio: bool = Field(False, description="Should be audio generated automatically") + + +class Text2ImageTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Text2ImageInputField = Field(...) + parameters: Txt2ImageParametersField = Field(...) + + +class Text2VideoTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Text2VideoInputField = Field(...) + parameters: Text2VideoParametersField = Field(...) + + +class Image2VideoTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Image2VideoInputField = Field(...) + parameters: Image2VideoParametersField = Field(...) + + +class TaskCreationOutputField(BaseModel): + task_id: str = Field(...) + task_status: str = Field(...) + + +class TaskCreationResponse(BaseModel): + output: Optional[TaskCreationOutputField] = Field(None) + request_id: str = Field(...) + code: Optional[str] = Field(None, description="The error code of the failed request.") + message: Optional[str] = Field(None, description="Details of the failed request.") + + +class TaskResult(BaseModel): + url: Optional[str] = Field(None) + code: Optional[str] = Field(None) + message: Optional[str] = Field(None) + + +class ImageTaskStatusOutputField(TaskCreationOutputField): + task_id: str = Field(...) + task_status: str = Field(...) + results: Optional[list[TaskResult]] = Field(None) + + +class VideoTaskStatusOutputField(TaskCreationOutputField): + task_id: str = Field(...) + task_status: str = Field(...) + video_url: Optional[str] = Field(None) + code: Optional[str] = Field(None) + message: Optional[str] = Field(None) + + +class ImageTaskStatusResponse(BaseModel): + output: Optional[ImageTaskStatusOutputField] = Field(None) + request_id: str = Field(...) + + +class VideoTaskStatusResponse(BaseModel): + output: Optional[VideoTaskStatusOutputField] = Field(None) + request_id: str = Field(...) + + +RES_IN_PARENS = re.compile(r'\((\d+)\s*[x×]\s*(\d+)\)') + + +async def process_task( + auth_kwargs: dict[str, str], + url: str, + request_model: Type[T], + response_model: Type[R], + payload: Union[Text2ImageTaskCreationRequest, Text2VideoTaskCreationRequest, Image2VideoTaskCreationRequest], + node_id: str, + estimated_duration: int, + poll_interval: int, +) -> Type[R]: + initial_response = await SynchronousOperation( + endpoint=ApiEndpoint( + path=url, + method=HttpMethod.POST, + request_model=request_model, + response_model=TaskCreationResponse, + ), + request=payload, + auth_kwargs=auth_kwargs, + ).execute() + + if not initial_response.output: + raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + + return await PollingOperation( + poll_endpoint=ApiEndpoint( + path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=response_model, + ), + completed_statuses=["SUCCEEDED"], + failed_statuses=["FAILED", "CANCELED", "UNKNOWN"], + status_extractor=lambda x: x.output.task_status, + estimated_duration=estimated_duration, + poll_interval=poll_interval, + node_id=node_id, + auth_kwargs=auth_kwargs, + ).execute() + + +class WanTextToImageApi(comfy_io.ComfyNode): + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="WanTextToImageApi", + display_name="Wan Text to Image", + category="api node/image/Wan", + description="Generates image based on text prompt.", + inputs=[ + comfy_io.Combo.Input( + "model", + options=["wan2.5-t2i-preview"], + default="wan2.5-t2i-preview", + tooltip="Model to use.", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", + ), + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid.", + optional=True, + ), + comfy_io.Int.Input( + "width", + default=1024, + min=768, + max=1440, + step=32, + optional=True, + ), + comfy_io.Int.Input( + "height", + default=1024, + min=768, + max=1440, + step=32, + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + comfy_io.Boolean.Input( + "prompt_extend", + default=True, + tooltip="Whether to enhance the prompt with AI assistance.", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the result.", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + negative_prompt: str = "", + width: int = 1024, + height: int = 1024, + seed: int = 0, + prompt_extend: bool = True, + watermark: bool = True, + ): + payload = Text2ImageTaskCreationRequest( + model=model, + input=Text2ImageInputField(prompt=prompt, negative_prompt=negative_prompt), + parameters=Txt2ImageParametersField( + size=f"{width}*{height}", + seed=seed, + prompt_extend=prompt_extend, + watermark=watermark, + ), + ) + response = await process_task( + { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + "/proxy/wan/api/v1/services/aigc/text2image/image-synthesis", + request_model=Text2ImageTaskCreationRequest, + response_model=ImageTaskStatusResponse, + payload=payload, + node_id=cls.hidden.unique_id, + estimated_duration=9, + poll_interval=3, + ) + return comfy_io.NodeOutput(await download_url_to_image_tensor(str(response.output.results[0].url))) + + +class WanTextToVideoApi(comfy_io.ComfyNode): + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="WanTextToVideoApi", + display_name="Wan Text to Video", + category="api node/video/Wan", + description="Generates video based on text prompt.", + inputs=[ + comfy_io.Combo.Input( + "model", + options=["wan2.5-t2v-preview"], + default="wan2.5-t2v-preview", + tooltip="Model to use.", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", + ), + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid.", + optional=True, + ), + comfy_io.Combo.Input( + "size", + options=[ + "480p: 1:1 (624x624)", + "480p: 16:9 (832x480)", + "480p: 9:16 (480x832)", + "720p: 1:1 (960x960)", + "720p: 16:9 (1280x720)", + "720p: 9:16 (720x1280)", + "720p: 4:3 (1088x832)", + "720p: 3:4 (832x1088)", + "1080p: 1:1 (1440x1440)", + "1080p: 16:9 (1920x1080)", + "1080p: 9:16 (1080x1920)", + "1080p: 4:3 (1632x1248)", + "1080p: 3:4 (1248x1632)", + ], + default="480p: 1:1 (624x624)", + optional=True, + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=10, + step=5, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Available durations: 5 and 10 seconds", + optional=True, + ), + comfy_io.Audio.Input( + "audio", + optional=True, + tooltip="Audio must contain a clear, loud voice, without extraneous noise, background music.", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + comfy_io.Boolean.Input( + "generate_audio", + default=False, + optional=True, + tooltip="If there is no audio input, generate audio automatically.", + ), + comfy_io.Boolean.Input( + "prompt_extend", + default=True, + tooltip="Whether to enhance the prompt with AI assistance.", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the result.", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + negative_prompt: str = "", + size: str = "480p: 1:1 (624x624)", + duration: int = 5, + audio: Optional[Input.Audio] = None, + seed: int = 0, + generate_audio: bool = False, + prompt_extend: bool = True, + watermark: bool = True, + ): + width, height = RES_IN_PARENS.search(size).groups() + audio_url = None + if audio is not None: + validate_audio_duration(audio, 3.0, 29.0) + audio_url = "data:audio/mp3;base64," + audio_to_base64_string(audio, "mp3", "libmp3lame") + payload = Text2VideoTaskCreationRequest( + model=model, + input=Text2VideoInputField(prompt=prompt, negative_prompt=negative_prompt, audio_url=audio_url), + parameters=Text2VideoParametersField( + size=f"{width}*{height}", + duration=duration, + seed=seed, + audio=generate_audio, + prompt_extend=prompt_extend, + watermark=watermark, + ), + ) + response = await process_task( + { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + "/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", + request_model=Text2VideoTaskCreationRequest, + response_model=VideoTaskStatusResponse, + payload=payload, + node_id=cls.hidden.unique_id, + estimated_duration=120 * int(duration / 5), + poll_interval=6, + ) + return comfy_io.NodeOutput(await download_url_to_video_output(response.output.video_url)) + + +class WanImageToVideoApi(comfy_io.ComfyNode): + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="WanImageToVideoApi", + display_name="Wan Image to Video", + category="api node/video/Wan", + description="Generates video based on the first frame and text prompt.", + inputs=[ + comfy_io.Combo.Input( + "model", + options=["wan2.5-i2v-preview"], + default="wan2.5-i2v-preview", + tooltip="Model to use.", + ), + comfy_io.Image.Input( + "image", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", + ), + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid.", + optional=True, + ), + comfy_io.Combo.Input( + "resolution", + options=[ + "480P", + "720P", + "1080P", + ], + default="480P", + optional=True, + ), + comfy_io.Int.Input( + "duration", + default=5, + min=5, + max=10, + step=5, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Available durations: 5 and 10 seconds", + optional=True, + ), + comfy_io.Audio.Input( + "audio", + optional=True, + tooltip="Audio must contain a clear, loud voice, without extraneous noise, background music.", + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + comfy_io.Boolean.Input( + "generate_audio", + default=False, + optional=True, + tooltip="If there is no audio input, generate audio automatically.", + ), + comfy_io.Boolean.Input( + "prompt_extend", + default=True, + tooltip="Whether to enhance the prompt with AI assistance.", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the result.", + optional=True, + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + image: torch.Tensor, + prompt: str, + negative_prompt: str = "", + resolution: str = "480P", + duration: int = 5, + audio: Optional[Input.Audio] = None, + seed: int = 0, + generate_audio: bool = False, + prompt_extend: bool = True, + watermark: bool = True, + ): + if get_number_of_images(image) != 1: + raise ValueError("Exactly one input image is required.") + image_url = "data:image/png;base64," + tensor_to_base64_string(image, total_pixels=2000*2000) + audio_url = None + if audio is not None: + validate_audio_duration(audio, 3.0, 29.0) + audio_url = "data:audio/mp3;base64," + audio_to_base64_string(audio, "mp3", "libmp3lame") + payload = Image2VideoTaskCreationRequest( + model=model, + input=Image2VideoInputField( + prompt=prompt, negative_prompt=negative_prompt, img_url=image_url, audio_url=audio_url + ), + parameters=Image2VideoParametersField( + resolution=resolution, + duration=duration, + seed=seed, + audio=generate_audio, + prompt_extend=prompt_extend, + watermark=watermark, + ), + ) + response = await process_task( + { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + "/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", + request_model=Image2VideoTaskCreationRequest, + response_model=VideoTaskStatusResponse, + payload=payload, + node_id=cls.hidden.unique_id, + estimated_duration=120 * int(duration / 5), + poll_interval=6, + ) + return comfy_io.NodeOutput(await download_url_to_video_output(response.output.video_url)) + + +class WanApiExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + WanTextToImageApi, + WanTextToVideoApi, + WanImageToVideoApi, + ] + + +async def comfy_entrypoint() -> WanApiExtension: + return WanApiExtension() diff --git a/nodes.py b/nodes.py index 5a5fdcb8e..1a6784b68 100644 --- a/nodes.py +++ b/nodes.py @@ -2361,6 +2361,7 @@ async def init_builtin_api_nodes(): "nodes_rodin.py", "nodes_gemini.py", "nodes_vidu.py", + "nodes_wan.py", ] if not await load_custom_node(os.path.join(api_nodes_dir, "canary.py"), module_parent="comfy_api_nodes"): From b8730510db30c8858e1e5d8e126ef19eac395560 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 23 Sep 2025 11:50:33 -0400 Subject: [PATCH 315/759] ComfyUI version 0.3.60 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index ee58205f5..d469a8194 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.59" +__version__ = "0.3.60" diff --git a/pyproject.toml b/pyproject.toml index a7fc1a5a6..7340c320b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.59" +version = "0.3.60" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 341b4adefd308cbcf82c07effc255f2770b3b3e2 Mon Sep 17 00:00:00 2001 From: Changrz <51637999+WhiteGiven@users.noreply.github.com> Date: Thu, 25 Sep 2025 02:05:37 +0800 Subject: [PATCH 316/759] Rodin3D - add [Rodin3D Gen-2 generate] api-node (#9994) * update Rodin api node * update rodin3d gen2 api node * fix images limited bug --- comfy_api_nodes/apis/rodin_api.py | 3 +- comfy_api_nodes/nodes_rodin.py | 140 ++++++++++++++++++++++++------ 2 files changed, 117 insertions(+), 26 deletions(-) diff --git a/comfy_api_nodes/apis/rodin_api.py b/comfy_api_nodes/apis/rodin_api.py index b0cf171fa..02cf42c29 100644 --- a/comfy_api_nodes/apis/rodin_api.py +++ b/comfy_api_nodes/apis/rodin_api.py @@ -9,8 +9,9 @@ class Rodin3DGenerateRequest(BaseModel): seed: int = Field(..., description="seed_") tier: str = Field(..., description="Tier of generation.") material: str = Field(..., description="The material type.") - quality: str = Field(..., description="The generation quality of the mesh.") + quality_override: int = Field(..., description="The poly count of the mesh.") mesh_mode: str = Field(..., description="It controls the type of faces of generated models.") + TAPose: Optional[bool] = Field(None, description="") class GenerateJobsData(BaseModel): uuids: List[str] = Field(..., description="str LIST") diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index c89d087e5..1af393eba 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -121,10 +121,10 @@ class Rodin3DAPI: else: return "Generating" - async def create_generate_task(self, images=None, seed=1, material="PBR", quality="medium", tier="Regular", mesh_mode="Quad", **kwargs): + async def create_generate_task(self, images=None, seed=1, material="PBR", quality_override=18000, tier="Regular", mesh_mode="Quad", TAPose = False, **kwargs): if images is None: raise Exception("Rodin 3D generate requires at least 1 image.") - if len(images) >= 5: + if len(images) > 5: raise Exception("Rodin 3D generate requires up to 5 image.") path = "/proxy/rodin/api/v2/rodin" @@ -139,8 +139,9 @@ class Rodin3DAPI: seed=seed, tier=tier, material=material, - quality=quality, - mesh_mode=mesh_mode + quality_override=quality_override, + mesh_mode=mesh_mode, + TAPose=TAPose, ), files=[ ( @@ -211,23 +212,36 @@ class Rodin3DAPI: return await operation.execute() def get_quality_mode(self, poly_count): - if poly_count == "200K-Triangle": + polycount = poly_count.split("-") + poly = polycount[1] + count = polycount[0] + if poly == "Triangle": mesh_mode = "Raw" - quality = "medium" + elif poly == "Quad": + mesh_mode = "Quad" else: mesh_mode = "Quad" - if poly_count == "4K-Quad": - quality = "extra-low" - elif poly_count == "8K-Quad": - quality = "low" - elif poly_count == "18K-Quad": - quality = "medium" - elif poly_count == "50K-Quad": - quality = "high" - else: - quality = "medium" - return mesh_mode, quality + if count == "4K": + quality_override = 4000 + elif count == "8K": + quality_override = 8000 + elif count == "18K": + quality_override = 18000 + elif count == "50K": + quality_override = 50000 + elif count == "2K": + quality_override = 2000 + elif count == "20K": + quality_override = 20000 + elif count == "150K": + quality_override = 150000 + elif count == "500K": + quality_override = 500000 + else: + quality_override = 18000 + + return mesh_mode, quality_override async def download_files(self, url_list): save_path = os.path.join(comfy_paths.get_output_directory(), "Rodin3D", datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) @@ -300,9 +314,9 @@ class Rodin3D_Regular(Rodin3DAPI): m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality = self.get_quality_mode(Polygon_count) + mesh_mode, quality_override = self.get_quality_mode(Polygon_count) task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, - quality=quality, tier=tier, mesh_mode=mesh_mode, + quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, **kwargs) await self.poll_for_task_status(subscription_key, **kwargs) download_list = await self.get_rodin_download_list(task_uuid, **kwargs) @@ -346,9 +360,9 @@ class Rodin3D_Detail(Rodin3DAPI): m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality = self.get_quality_mode(Polygon_count) + mesh_mode, quality_override = self.get_quality_mode(Polygon_count) task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, - quality=quality, tier=tier, mesh_mode=mesh_mode, + quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, **kwargs) await self.poll_for_task_status(subscription_key, **kwargs) download_list = await self.get_rodin_download_list(task_uuid, **kwargs) @@ -392,9 +406,9 @@ class Rodin3D_Smooth(Rodin3DAPI): m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality = self.get_quality_mode(Polygon_count) + mesh_mode, quality_override = self.get_quality_mode(Polygon_count) task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, - quality=quality, tier=tier, mesh_mode=mesh_mode, + quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, **kwargs) await self.poll_for_task_status(subscription_key, **kwargs) download_list = await self.get_rodin_download_list(task_uuid, **kwargs) @@ -446,10 +460,10 @@ class Rodin3D_Sketch(Rodin3DAPI): for i in range(num_images): m_images.append(Images[i]) material_type = "PBR" - quality = "medium" + quality_override = 18000 mesh_mode = "Quad" task_uuid, subscription_key = await self.create_generate_task( - images=m_images, seed=Seed, material=material_type, quality=quality, tier=tier, mesh_mode=mesh_mode, **kwargs + images=m_images, seed=Seed, material=material_type, quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, **kwargs ) await self.poll_for_task_status(subscription_key, **kwargs) download_list = await self.get_rodin_download_list(task_uuid, **kwargs) @@ -457,6 +471,80 @@ class Rodin3D_Sketch(Rodin3DAPI): return (model,) +class Rodin3D_Gen2(Rodin3DAPI): + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "Images": + ( + IO.IMAGE, + { + "forceInput":True, + } + ) + }, + "optional": { + "Seed": ( + IO.INT, + { + "default":0, + "min":0, + "max":65535, + "display":"number" + } + ), + "Material_Type": ( + IO.COMBO, + { + "options": ["PBR", "Shaded"], + "default": "PBR" + } + ), + "Polygon_count": ( + IO.COMBO, + { + "options": ["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "2K-Triangle", "20K-Triangle", "150K-Triangle", "500K-Triangle"], + "default": "500K-Triangle" + } + ), + "TAPose": ( + IO.BOOLEAN, + { + "default": False, + } + ) + }, + "hidden": { + "auth_token": "AUTH_TOKEN_COMFY_ORG", + "comfy_api_key": "API_KEY_COMFY_ORG", + }, + } + + async def api_call( + self, + Images, + Seed, + Material_Type, + Polygon_count, + TAPose, + **kwargs + ): + tier = "Gen-2" + num_images = Images.shape[0] + m_images = [] + for i in range(num_images): + m_images.append(Images[i]) + mesh_mode, quality_override = self.get_quality_mode(Polygon_count) + task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, + quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, TAPose=TAPose, + **kwargs) + await self.poll_for_task_status(subscription_key, **kwargs) + download_list = await self.get_rodin_download_list(task_uuid, **kwargs) + model = await self.download_files(download_list) + + return (model,) + # A dictionary that contains all nodes you want to export with their names # NOTE: names should be globally unique NODE_CLASS_MAPPINGS = { @@ -464,6 +552,7 @@ NODE_CLASS_MAPPINGS = { "Rodin3D_Detail": Rodin3D_Detail, "Rodin3D_Smooth": Rodin3D_Smooth, "Rodin3D_Sketch": Rodin3D_Sketch, + "Rodin3D_Gen2": Rodin3D_Gen2, } # A dictionary that contains the friendly/humanly readable titles for the nodes @@ -472,4 +561,5 @@ NODE_DISPLAY_NAME_MAPPINGS = { "Rodin3D_Detail": "Rodin 3D Generate - Detail Generate", "Rodin3D_Smooth": "Rodin 3D Generate - Smooth Generate", "Rodin3D_Sketch": "Rodin 3D Generate - Sketch Generate", + "Rodin3D_Gen2": "Rodin 3D Generate - Gen-2 Generate", } From fd79d32f38fd24adca5a6e8214f05050f287c9db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Thu, 25 Sep 2025 01:59:29 +0300 Subject: [PATCH 317/759] Add new audio nodes (#9908) * Add new audio nodes - TrimAudioDuration - SplitAudioChannels - AudioConcat - AudioMerge - AudioAdjustVolume * Update nodes_audio.py * Add EmptyAudio -node * Change duration to Float (allows sub seconds) --- comfy_extras/nodes_audio.py | 223 ++++++++++++++++++++++++++++++++++++ 1 file changed, 223 insertions(+) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 3b23f65d8..51c8b9dd9 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -11,6 +11,7 @@ import json import random import hashlib import node_helpers +import logging from comfy.cli_args import args from comfy.comfy_types import FileLocator @@ -364,6 +365,216 @@ class RecordAudio: return (audio, ) +class TrimAudioDuration: + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "audio": ("AUDIO",), + "start_index": ("FLOAT", {"default": 0.0, "min": -0xffffffffffffffff, "max": 0xffffffffffffffff, "step": 0.01, "tooltip": "Start time in seconds, can be negative to count from the end (supports sub-seconds)."}), + "duration": ("FLOAT", {"default": 60.0, "min": 0.0, "step": 0.01, "tooltip": "Duration in seconds"}), + }, + } + + FUNCTION = "trim" + RETURN_TYPES = ("AUDIO",) + CATEGORY = "audio" + DESCRIPTION = "Trim audio tensor into chosen time range." + + def trim(self, audio, start_index, duration): + waveform = audio["waveform"] + sample_rate = audio["sample_rate"] + audio_length = waveform.shape[-1] + + if start_index < 0: + start_frame = audio_length + int(round(start_index * sample_rate)) + else: + start_frame = int(round(start_index * sample_rate)) + start_frame = max(0, min(start_frame, audio_length - 1)) + + end_frame = start_frame + int(round(duration * sample_rate)) + end_frame = max(0, min(end_frame, audio_length)) + + if start_frame >= end_frame: + raise ValueError("AudioTrim: Start time must be less than end time and be within the audio length.") + + return ({"waveform": waveform[..., start_frame:end_frame], "sample_rate": sample_rate},) + + +class SplitAudioChannels: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "audio": ("AUDIO",), + }} + + RETURN_TYPES = ("AUDIO", "AUDIO") + RETURN_NAMES = ("left", "right") + FUNCTION = "separate" + CATEGORY = "audio" + DESCRIPTION = "Separates the audio into left and right channels." + + def separate(self, audio): + waveform = audio["waveform"] + sample_rate = audio["sample_rate"] + + if waveform.shape[1] != 2: + raise ValueError("AudioSplit: Input audio has only one channel.") + + left_channel = waveform[..., 0:1, :] + right_channel = waveform[..., 1:2, :] + + return ({"waveform": left_channel, "sample_rate": sample_rate}, {"waveform": right_channel, "sample_rate": sample_rate}) + + +def match_audio_sample_rates(waveform_1, sample_rate_1, waveform_2, sample_rate_2): + if sample_rate_1 != sample_rate_2: + if sample_rate_1 > sample_rate_2: + waveform_2 = torchaudio.functional.resample(waveform_2, sample_rate_2, sample_rate_1) + output_sample_rate = sample_rate_1 + logging.info(f"Resampling audio2 from {sample_rate_2}Hz to {sample_rate_1}Hz for merging.") + else: + waveform_1 = torchaudio.functional.resample(waveform_1, sample_rate_1, sample_rate_2) + output_sample_rate = sample_rate_2 + logging.info(f"Resampling audio1 from {sample_rate_1}Hz to {sample_rate_2}Hz for merging.") + else: + output_sample_rate = sample_rate_1 + return waveform_1, waveform_2, output_sample_rate + + +class AudioConcat: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "audio1": ("AUDIO",), + "audio2": ("AUDIO",), + "direction": (['after', 'before'], {"default": 'after', "tooltip": "Whether to append audio2 after or before audio1."}), + }} + + RETURN_TYPES = ("AUDIO",) + FUNCTION = "concat" + CATEGORY = "audio" + DESCRIPTION = "Concatenates the audio1 to audio2 in the specified direction." + + def concat(self, audio1, audio2, direction): + waveform_1 = audio1["waveform"] + waveform_2 = audio2["waveform"] + sample_rate_1 = audio1["sample_rate"] + sample_rate_2 = audio2["sample_rate"] + + if waveform_1.shape[1] == 1: + waveform_1 = waveform_1.repeat(1, 2, 1) + logging.info("AudioConcat: Converted mono audio1 to stereo by duplicating the channel.") + if waveform_2.shape[1] == 1: + waveform_2 = waveform_2.repeat(1, 2, 1) + logging.info("AudioConcat: Converted mono audio2 to stereo by duplicating the channel.") + + waveform_1, waveform_2, output_sample_rate = match_audio_sample_rates(waveform_1, sample_rate_1, waveform_2, sample_rate_2) + + if direction == 'after': + concatenated_audio = torch.cat((waveform_1, waveform_2), dim=2) + elif direction == 'before': + concatenated_audio = torch.cat((waveform_2, waveform_1), dim=2) + + return ({"waveform": concatenated_audio, "sample_rate": output_sample_rate},) + + +class AudioMerge: + @classmethod + def INPUT_TYPES(cls): + return { + "required": { + "audio1": ("AUDIO",), + "audio2": ("AUDIO",), + "merge_method": (["add", "mean", "subtract", "multiply"], {"tooltip": "The method used to combine the audio waveforms."}), + }, + } + + FUNCTION = "merge" + RETURN_TYPES = ("AUDIO",) + CATEGORY = "audio" + DESCRIPTION = "Combine two audio tracks by overlaying their waveforms." + + def merge(self, audio1, audio2, merge_method): + waveform_1 = audio1["waveform"] + waveform_2 = audio2["waveform"] + sample_rate_1 = audio1["sample_rate"] + sample_rate_2 = audio2["sample_rate"] + + waveform_1, waveform_2, output_sample_rate = match_audio_sample_rates(waveform_1, sample_rate_1, waveform_2, sample_rate_2) + + length_1 = waveform_1.shape[-1] + length_2 = waveform_2.shape[-1] + + if length_2 > length_1: + logging.info(f"AudioMerge: Trimming audio2 from {length_2} to {length_1} samples to match audio1 length.") + waveform_2 = waveform_2[..., :length_1] + elif length_2 < length_1: + logging.info(f"AudioMerge: Padding audio2 from {length_2} to {length_1} samples to match audio1 length.") + pad_shape = list(waveform_2.shape) + pad_shape[-1] = length_1 - length_2 + pad_tensor = torch.zeros(pad_shape, dtype=waveform_2.dtype, device=waveform_2.device) + waveform_2 = torch.cat((waveform_2, pad_tensor), dim=-1) + + if merge_method == "add": + waveform = waveform_1 + waveform_2 + elif merge_method == "subtract": + waveform = waveform_1 - waveform_2 + elif merge_method == "multiply": + waveform = waveform_1 * waveform_2 + elif merge_method == "mean": + waveform = (waveform_1 + waveform_2) / 2 + + max_val = waveform.abs().max() + if max_val > 1.0: + waveform = waveform / max_val + + return ({"waveform": waveform, "sample_rate": output_sample_rate},) + + +class AudioAdjustVolume: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "audio": ("AUDIO",), + "volume": ("INT", {"default": 1.0, "min": -100, "max": 100, "tooltip": "Volume adjustment in decibels (dB). 0 = no change, +6 = double, -6 = half, etc"}), + }} + + RETURN_TYPES = ("AUDIO",) + FUNCTION = "adjust_volume" + CATEGORY = "audio" + + def adjust_volume(self, audio, volume): + if volume == 0: + return (audio,) + waveform = audio["waveform"] + sample_rate = audio["sample_rate"] + + gain = 10 ** (volume / 20) + waveform = waveform * gain + + return ({"waveform": waveform, "sample_rate": sample_rate},) + + +class EmptyAudio: + @classmethod + def INPUT_TYPES(s): + return {"required": { + "duration": ("FLOAT", {"default": 60.0, "min": 0.0, "max": 0xffffffffffffffff, "step": 0.01, "tooltip": "Duration of the empty audio clip in seconds"}), + "sample_rate": ("INT", {"default": 44100, "tooltip": "Sample rate of the empty audio clip."}), + "channels": ("INT", {"default": 2, "min": 1, "max": 2, "tooltip": "Number of audio channels (1 for mono, 2 for stereo)."}), + }} + + RETURN_TYPES = ("AUDIO",) + FUNCTION = "create_empty_audio" + CATEGORY = "audio" + + def create_empty_audio(self, duration, sample_rate, channels): + num_samples = int(round(duration * sample_rate)) + waveform = torch.zeros((1, channels, num_samples), dtype=torch.float32) + return ({"waveform": waveform, "sample_rate": sample_rate},) + + NODE_CLASS_MAPPINGS = { "EmptyLatentAudio": EmptyLatentAudio, "VAEEncodeAudio": VAEEncodeAudio, @@ -375,6 +586,12 @@ NODE_CLASS_MAPPINGS = { "PreviewAudio": PreviewAudio, "ConditioningStableAudio": ConditioningStableAudio, "RecordAudio": RecordAudio, + "TrimAudioDuration": TrimAudioDuration, + "SplitAudioChannels": SplitAudioChannels, + "AudioConcat": AudioConcat, + "AudioMerge": AudioMerge, + "AudioAdjustVolume": AudioAdjustVolume, + "EmptyAudio": EmptyAudio, } NODE_DISPLAY_NAME_MAPPINGS = { @@ -387,4 +604,10 @@ NODE_DISPLAY_NAME_MAPPINGS = { "SaveAudioMP3": "Save Audio (MP3)", "SaveAudioOpus": "Save Audio (Opus)", "RecordAudio": "Record Audio", + "TrimAudioDuration": "Trim Audio Duration", + "SplitAudioChannels": "Split Audio Channels", + "AudioConcat": "Audio Concat", + "AudioMerge": "Audio Merge", + "AudioAdjustVolume": "Audio Adjust Volume", + "EmptyAudio": "Empty Audio", } From fccab99ec0fcd13e80fa59bc73bccff31f9450ca Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 24 Sep 2025 17:09:42 -0700 Subject: [PATCH 318/759] Fix issue with .view() in HuMo. (#10014) --- comfy/ldm/wan/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 2dac5980c..54616e6eb 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -1355,7 +1355,7 @@ class WanT2VCrossAttentionGather(WanSelfAttention): x = optimized_attention(q, k, v, heads=self.num_heads, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options) - x = x.transpose(1, 2).view(b, -1, n, d).flatten(2) + x = x.transpose(1, 2).reshape(b, -1, n * d) x = self.o(x) return x From c8d2117f02bcad6d8316ffd8273bdc27adf83b44 Mon Sep 17 00:00:00 2001 From: Guy Niv <43928922+guyniv@users.noreply.github.com> Date: Thu, 25 Sep 2025 05:35:12 +0300 Subject: [PATCH 319/759] Fix memory leak by properly detaching model finalizer (#9979) When unloading models in load_models_gpu(), the model finalizer was not being explicitly detached, leading to a memory leak. This caused linear memory consumption increase over time as models are repeatedly loaded and unloaded. This change prevents orphaned finalizer references from accumulating in memory during model switching operations. --- comfy/model_management.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index d880f1970..c5b817b62 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -645,7 +645,9 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu if loaded_model.model.is_clone(current_loaded_models[i].model): to_unload = [i] + to_unload for i in to_unload: - current_loaded_models.pop(i).model.detach(unpatch_all=False) + model_to_unload = current_loaded_models.pop(i) + model_to_unload.model.detach(unpatch_all=False) + model_to_unload.model_finalizer.detach() total_memory_required = {} for loaded_model in models_to_load: From ce4cb2389c8ce63cf8735f200b8672a2c1be0950 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:20:13 -0700 Subject: [PATCH 320/759] Make LatentCompositeMasked work with basic video latents. (#10023) --- comfy_extras/nodes_mask.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index 2b0f8dd5d..a5e405008 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -12,35 +12,38 @@ from nodes import MAX_RESOLUTION def composite(destination, source, x, y, mask = None, multiplier = 8, resize_source = False): source = source.to(destination.device) if resize_source: - source = torch.nn.functional.interpolate(source, size=(destination.shape[2], destination.shape[3]), mode="bilinear") + source = torch.nn.functional.interpolate(source, size=(destination.shape[-2], destination.shape[-1]), mode="bilinear") source = comfy.utils.repeat_to_batch_size(source, destination.shape[0]) - x = max(-source.shape[3] * multiplier, min(x, destination.shape[3] * multiplier)) - y = max(-source.shape[2] * multiplier, min(y, destination.shape[2] * multiplier)) + x = max(-source.shape[-1] * multiplier, min(x, destination.shape[-1] * multiplier)) + y = max(-source.shape[-2] * multiplier, min(y, destination.shape[-2] * multiplier)) left, top = (x // multiplier, y // multiplier) - right, bottom = (left + source.shape[3], top + source.shape[2],) + right, bottom = (left + source.shape[-1], top + source.shape[-2],) if mask is None: mask = torch.ones_like(source) else: mask = mask.to(destination.device, copy=True) - mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(source.shape[2], source.shape[3]), mode="bilinear") + mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(source.shape[-2], source.shape[-1]), mode="bilinear") mask = comfy.utils.repeat_to_batch_size(mask, source.shape[0]) # calculate the bounds of the source that will be overlapping the destination # this prevents the source trying to overwrite latent pixels that are out of bounds # of the destination - visible_width, visible_height = (destination.shape[3] - left + min(0, x), destination.shape[2] - top + min(0, y),) + visible_width, visible_height = (destination.shape[-1] - left + min(0, x), destination.shape[-2] - top + min(0, y),) mask = mask[:, :, :visible_height, :visible_width] + if mask.ndim < source.ndim: + mask = mask.unsqueeze(1) + inverse_mask = torch.ones_like(mask) - mask - source_portion = mask * source[:, :, :visible_height, :visible_width] - destination_portion = inverse_mask * destination[:, :, top:bottom, left:right] + source_portion = mask * source[..., :visible_height, :visible_width] + destination_portion = inverse_mask * destination[..., top:bottom, left:right] - destination[:, :, top:bottom, left:right] = source_portion + destination_portion + destination[..., top:bottom, left:right] = source_portion + destination_portion return destination class LatentCompositeMasked: From 2b7f9a8196304badb5fe58e5c734e4b182ad0fdf Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 26 Sep 2025 11:12:43 -0700 Subject: [PATCH 321/759] Fix the failing unit test. (#10037) --- .github/workflows/test-unit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-unit.yml b/.github/workflows/test-unit.yml index 78c918031..00caf5b8a 100644 --- a/.github/workflows/test-unit.yml +++ b/.github/workflows/test-unit.yml @@ -10,7 +10,7 @@ jobs: test: strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] + os: [ubuntu-latest, windows-2022, macos-latest] runs-on: ${{ matrix.os }} continue-on-error: true steps: From c4a46e943c12c7f3f6ac72f8fb51caad514ec9b6 Mon Sep 17 00:00:00 2001 From: Yoland Yan <4950057+yoland68@users.noreply.github.com> Date: Fri, 26 Sep 2025 14:08:16 -0700 Subject: [PATCH 322/759] Add @kosinkadink as code owner (#10041) Updated CODEOWNERS to include @kosinkadink as a code owner. --- CODEOWNERS | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index c8acd66d5..b7aca9b26 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,25 +1,3 @@ # Admins * @comfyanonymous - -# Note: Github teams syntax cannot be used here as the repo is not owned by Comfy-Org. -# Inlined the team members for now. - -# Maintainers -*.md @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill -/tests/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill -/tests-unit/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill -/notebooks/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill -/script_examples/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill -/.github/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill -/requirements.txt @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill -/pyproject.toml @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne @guill - -# Python web server -/api_server/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill -/app/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill -/utils/ @yoland68 @robinjhuang @webfiltered @pythongosssss @ltdrdata @christian-byrne @guill - -# Node developers -/comfy_extras/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill -/comfy/comfy_types/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill -/comfy_api_nodes/ @yoland68 @robinjhuang @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne @guill +* @kosinkadink From 76eb1d72c3e5bef51d6ca8a26bf996972d3f6d1a Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 00:10:49 +0300 Subject: [PATCH 323/759] convert nodes_rebatch.py to V3 schema (#9945) --- comfy_extras/nodes_rebatch.py | 97 ++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 41 deletions(-) diff --git a/comfy_extras/nodes_rebatch.py b/comfy_extras/nodes_rebatch.py index e29cb9ed1..5f4e82aef 100644 --- a/comfy_extras/nodes_rebatch.py +++ b/comfy_extras/nodes_rebatch.py @@ -1,18 +1,25 @@ +from typing_extensions import override import torch -class LatentRebatch: +from comfy_api.latest import ComfyExtension, io + + +class LatentRebatch(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "latents": ("LATENT",), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }} - RETURN_TYPES = ("LATENT",) - INPUT_IS_LIST = True - OUTPUT_IS_LIST = (True, ) - - FUNCTION = "rebatch" - - CATEGORY = "latent/batch" + def define_schema(cls): + return io.Schema( + node_id="RebatchLatents", + display_name="Rebatch Latents", + category="latent/batch", + is_input_list=True, + inputs=[ + io.Latent.Input("latents"), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(is_output_list=True), + ], + ) @staticmethod def get_batch(latents, list_ind, offset): @@ -53,7 +60,8 @@ class LatentRebatch: result = [torch.cat((b1, b2)) if torch.is_tensor(b1) else b1 + b2 for b1, b2 in zip(batch1, batch2)] return result - def rebatch(self, latents, batch_size): + @classmethod + def execute(cls, latents, batch_size): batch_size = batch_size[0] output_list = [] @@ -63,24 +71,24 @@ class LatentRebatch: for i in range(len(latents)): # fetch new entry of list #samples, masks, indices = self.get_batch(latents, i) - next_batch = self.get_batch(latents, i, processed) + next_batch = cls.get_batch(latents, i, processed) processed += len(next_batch[2]) # set to current if current is None if current_batch[0] is None: current_batch = next_batch # add previous to list if dimensions do not match elif next_batch[0].shape[-1] != current_batch[0].shape[-1] or next_batch[0].shape[-2] != current_batch[0].shape[-2]: - sliced, _ = self.slice_batch(current_batch, 1, batch_size) + sliced, _ = cls.slice_batch(current_batch, 1, batch_size) output_list.append({'samples': sliced[0][0], 'noise_mask': sliced[1][0], 'batch_index': sliced[2][0]}) current_batch = next_batch # cat if everything checks out else: - current_batch = self.cat_batch(current_batch, next_batch) + current_batch = cls.cat_batch(current_batch, next_batch) # add to list if dimensions gone above target batch size if current_batch[0].shape[0] > batch_size: num = current_batch[0].shape[0] // batch_size - sliced, remainder = self.slice_batch(current_batch, num, batch_size) + sliced, remainder = cls.slice_batch(current_batch, num, batch_size) for i in range(num): output_list.append({'samples': sliced[0][i], 'noise_mask': sliced[1][i], 'batch_index': sliced[2][i]}) @@ -89,7 +97,7 @@ class LatentRebatch: #add remainder if current_batch[0] is not None: - sliced, _ = self.slice_batch(current_batch, 1, batch_size) + sliced, _ = cls.slice_batch(current_batch, 1, batch_size) output_list.append({'samples': sliced[0][0], 'noise_mask': sliced[1][0], 'batch_index': sliced[2][0]}) #get rid of empty masks @@ -97,23 +105,27 @@ class LatentRebatch: if s['noise_mask'].mean() == 1.0: del s['noise_mask'] - return (output_list,) + return io.NodeOutput(output_list) -class ImageRebatch: +class ImageRebatch(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "images": ("IMAGE",), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - }} - RETURN_TYPES = ("IMAGE",) - INPUT_IS_LIST = True - OUTPUT_IS_LIST = (True, ) + def define_schema(cls): + return io.Schema( + node_id="RebatchImages", + display_name="Rebatch Images", + category="image/batch", + is_input_list=True, + inputs=[ + io.Image.Input("images"), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Image.Output(is_output_list=True), + ], + ) - FUNCTION = "rebatch" - - CATEGORY = "image/batch" - - def rebatch(self, images, batch_size): + @classmethod + def execute(cls, images, batch_size): batch_size = batch_size[0] output_list = [] @@ -125,14 +137,17 @@ class ImageRebatch: for i in range(0, len(all_images), batch_size): output_list.append(torch.cat(all_images[i:i+batch_size], dim=0)) - return (output_list,) + return io.NodeOutput(output_list) -NODE_CLASS_MAPPINGS = { - "RebatchLatents": LatentRebatch, - "RebatchImages": ImageRebatch, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "RebatchLatents": "Rebatch Latents", - "RebatchImages": "Rebatch Images", -} +class RebatchExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + LatentRebatch, + ImageRebatch, + ] + + +async def comfy_entrypoint() -> RebatchExtension: + return RebatchExtension() From 7ea173c1873ec22df6edabc80a912a08ae2d521b Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 00:12:04 +0300 Subject: [PATCH 324/759] convert nodes_fresca.py to V3 schema (#9951) --- comfy_extras/nodes_fresca.py | 61 +++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/comfy_extras/nodes_fresca.py b/comfy_extras/nodes_fresca.py index 65c2d0d0e..f308eb0c1 100644 --- a/comfy_extras/nodes_fresca.py +++ b/comfy_extras/nodes_fresca.py @@ -1,6 +1,8 @@ # Code based on https://github.com/WikiChao/FreSca (MIT License) import torch import torch.fft as fft +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io def Fourier_filter(x, scale_low=1.0, scale_high=1.5, freq_cutoff=20): @@ -51,25 +53,31 @@ def Fourier_filter(x, scale_low=1.0, scale_high=1.5, freq_cutoff=20): return x_filtered -class FreSca: +class FreSca(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": ("MODEL",), - "scale_low": ("FLOAT", {"default": 1.0, "min": 0, "max": 10, "step": 0.01, - "tooltip": "Scaling factor for low-frequency components"}), - "scale_high": ("FLOAT", {"default": 1.25, "min": 0, "max": 10, "step": 0.01, - "tooltip": "Scaling factor for high-frequency components"}), - "freq_cutoff": ("INT", {"default": 20, "min": 1, "max": 10000, "step": 1, - "tooltip": "Number of frequency indices around center to consider as low-frequency"}), - } - } - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" - CATEGORY = "_for_testing" - DESCRIPTION = "Applies frequency-dependent scaling to the guidance" - def patch(self, model, scale_low, scale_high, freq_cutoff): + def define_schema(cls): + return io.Schema( + node_id="FreSca", + display_name="FreSca", + category="_for_testing", + description="Applies frequency-dependent scaling to the guidance", + inputs=[ + io.Model.Input("model"), + io.Float.Input("scale_low", default=1.0, min=0, max=10, step=0.01, + tooltip="Scaling factor for low-frequency components"), + io.Float.Input("scale_high", default=1.25, min=0, max=10, step=0.01, + tooltip="Scaling factor for high-frequency components"), + io.Int.Input("freq_cutoff", default=20, min=1, max=10000, step=1, + tooltip="Number of frequency indices around center to consider as low-frequency"), + ], + outputs=[ + io.Model.Output(), + ], + is_experimental=True, + ) + + @classmethod + def execute(cls, model, scale_low, scale_high, freq_cutoff): def custom_cfg_function(args): conds_out = args["conds_out"] if len(conds_out) <= 1 or None in args["conds"][:2]: @@ -91,13 +99,16 @@ class FreSca: m = model.clone() m.set_model_sampler_pre_cfg_function(custom_cfg_function) - return (m,) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "FreSca": FreSca, -} +class FreScaExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + FreSca, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "FreSca": "FreSca", -} + +async def comfy_entrypoint() -> FreScaExtension: + return FreScaExtension() From 80718908a9ac1045ece84285ca568511dcc9bc46 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 00:12:38 +0300 Subject: [PATCH 325/759] convert nodes_sdupscale.py to V3 schema (#9943) --- comfy_extras/nodes_sdupscale.py | 54 +++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/comfy_extras/nodes_sdupscale.py b/comfy_extras/nodes_sdupscale.py index bba67e8dd..31b373370 100644 --- a/comfy_extras/nodes_sdupscale.py +++ b/comfy_extras/nodes_sdupscale.py @@ -1,23 +1,31 @@ +from typing_extensions import override + import torch import comfy.utils +from comfy_api.latest import ComfyExtension, io -class SD_4XUpscale_Conditioning: +class SD_4XUpscale_Conditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "images": ("IMAGE",), - "positive": ("CONDITIONING",), - "negative": ("CONDITIONING",), - "scale_ratio": ("FLOAT", {"default": 4.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "noise_augmentation": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), - }} - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") + def define_schema(cls): + return io.Schema( + node_id="SD_4XUpscale_Conditioning", + category="conditioning/upscale_diffusion", + inputs=[ + io.Image.Input("images"), + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Float.Input("scale_ratio", default=4.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("noise_augmentation", default=0.0, min=0.0, max=1.0, step=0.001), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - FUNCTION = "encode" - - CATEGORY = "conditioning/upscale_diffusion" - - def encode(self, images, positive, negative, scale_ratio, noise_augmentation): + @classmethod + def execute(cls, images, positive, negative, scale_ratio, noise_augmentation): width = max(1, round(images.shape[-2] * scale_ratio)) height = max(1, round(images.shape[-3] * scale_ratio)) @@ -39,8 +47,16 @@ class SD_4XUpscale_Conditioning: out_cn.append(n) latent = torch.zeros([images.shape[0], 4, height // 4, width // 4]) - return (out_cp, out_cn, {"samples":latent}) + return io.NodeOutput(out_cp, out_cn, {"samples":latent}) -NODE_CLASS_MAPPINGS = { - "SD_4XUpscale_Conditioning": SD_4XUpscale_Conditioning, -} + +class SdUpscaleExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + SD_4XUpscale_Conditioning, + ] + + +async def comfy_entrypoint() -> SdUpscaleExtension: + return SdUpscaleExtension() From a061b06321b4e91d05c7c436b1e9b188360c5377 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 00:13:05 +0300 Subject: [PATCH 326/759] convert nodes_tcfg.py to V3 schema (#9942) --- comfy_extras/nodes_tcfg.py | 51 +++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/comfy_extras/nodes_tcfg.py b/comfy_extras/nodes_tcfg.py index 35b89a73f..1a6767770 100644 --- a/comfy_extras/nodes_tcfg.py +++ b/comfy_extras/nodes_tcfg.py @@ -1,8 +1,9 @@ # TCFG: Tangential Damping Classifier-free Guidance - (arXiv: https://arxiv.org/abs/2503.18137) +from typing_extensions import override import torch -from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict +from comfy_api.latest import ComfyExtension, io def score_tangential_damping(cond_score: torch.Tensor, uncond_score: torch.Tensor) -> torch.Tensor: @@ -26,23 +27,24 @@ def score_tangential_damping(cond_score: torch.Tensor, uncond_score: torch.Tenso return uncond_score_td.reshape_as(uncond_score).to(uncond_score.dtype) -class TCFG(ComfyNodeABC): +class TCFG(io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "model": (IO.MODEL, {}), - } - } + def define_schema(cls): + return io.Schema( + node_id="TCFG", + display_name="Tangential Damping CFG", + category="advanced/guidance", + description="TCFG – Tangential Damping CFG (2503.18137)\n\nRefine the uncond (negative) to align with the cond (positive) for improving quality.", + inputs=[ + io.Model.Input("model"), + ], + outputs=[ + io.Model.Output(display_name="patched_model"), + ], + ) - RETURN_TYPES = (IO.MODEL,) - RETURN_NAMES = ("patched_model",) - FUNCTION = "patch" - - CATEGORY = "advanced/guidance" - DESCRIPTION = "TCFG – Tangential Damping CFG (2503.18137)\n\nRefine the uncond (negative) to align with the cond (positive) for improving quality." - - def patch(self, model): + @classmethod + def execute(cls, model): m = model.clone() def tangential_damping_cfg(args): @@ -59,13 +61,16 @@ class TCFG(ComfyNodeABC): return [cond_pred, uncond_pred_td] + conds_out[2:] m.set_model_sampler_pre_cfg_function(tangential_damping_cfg) - return (m,) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "TCFG": TCFG, -} +class TcfgExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TCFG, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "TCFG": "Tangential Damping CFG", -} + +async def comfy_entrypoint() -> TcfgExtension: + return TcfgExtension() From d20576e6a3527d0763ba8d7a72c70ee66829690a Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 00:13:52 +0300 Subject: [PATCH 327/759] convert nodes_sag.py to V3 schema (#9940) --- comfy_extras/nodes_sag.py | 50 +++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/comfy_extras/nodes_sag.py b/comfy_extras/nodes_sag.py index 1bd8d7364..0f47db30b 100644 --- a/comfy_extras/nodes_sag.py +++ b/comfy_extras/nodes_sag.py @@ -2,10 +2,13 @@ import torch from torch import einsum import torch.nn.functional as F import math +from typing_extensions import override from einops import rearrange, repeat from comfy.ldm.modules.attention import optimized_attention import comfy.samplers +from comfy_api.latest import ComfyExtension, io + # from comfy/ldm/modules/attention.py # but modified to return attention scores as well as output @@ -104,19 +107,26 @@ def gaussian_blur_2d(img, kernel_size, sigma): img = F.conv2d(img, kernel2d, groups=img.shape[-3]) return img -class SelfAttentionGuidance: +class SelfAttentionGuidance(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "scale": ("FLOAT", {"default": 0.5, "min": -2.0, "max": 5.0, "step": 0.01}), - "blur_sigma": ("FLOAT", {"default": 2.0, "min": 0.0, "max": 10.0, "step": 0.1}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="SelfAttentionGuidance", + display_name="Self-Attention Guidance", + category="_for_testing", + inputs=[ + io.Model.Input("model"), + io.Float.Input("scale", default=0.5, min=-2.0, max=5.0, step=0.01), + io.Float.Input("blur_sigma", default=2.0, min=0.0, max=10.0, step=0.1), + ], + outputs=[ + io.Model.Output(), + ], + is_experimental=True, + ) - CATEGORY = "_for_testing" - - def patch(self, model, scale, blur_sigma): + @classmethod + def execute(cls, model, scale, blur_sigma): m = model.clone() attn_scores = None @@ -170,12 +180,16 @@ class SelfAttentionGuidance: # unet.mid_block.attentions[0].transformer_blocks[0].attn1.patch m.set_model_attn1_replace(attn_and_record, "middle", 0, 0) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "SelfAttentionGuidance": SelfAttentionGuidance, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "SelfAttentionGuidance": "Self-Attention Guidance", -} +class SagExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + SelfAttentionGuidance, + ] + + +async def comfy_entrypoint() -> SagExtension: + return SagExtension() From 2103e393350d297ef77497a1b14a8199d4a1f1b4 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 00:14:42 +0300 Subject: [PATCH 328/759] convert nodes_post_processing to V3 schema (#9491) --- comfy_extras/nodes_post_processing.py | 249 ++++++++++++-------------- 1 file changed, 111 insertions(+), 138 deletions(-) diff --git a/comfy_extras/nodes_post_processing.py b/comfy_extras/nodes_post_processing.py index ed7a07152..34c388a5a 100644 --- a/comfy_extras/nodes_post_processing.py +++ b/comfy_extras/nodes_post_processing.py @@ -1,3 +1,4 @@ +from typing_extensions import override import numpy as np import torch import torch.nn.functional as F @@ -7,33 +8,27 @@ import math import comfy.utils import comfy.model_management import node_helpers +from comfy_api.latest import ComfyExtension, io -class Blend: - def __init__(self): - pass +class Blend(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="ImageBlend", + category="image/postprocessing", + inputs=[ + io.Image.Input("image1"), + io.Image.Input("image2"), + io.Float.Input("blend_factor", default=0.5, min=0.0, max=1.0, step=0.01), + io.Combo.Input("blend_mode", options=["normal", "multiply", "screen", "overlay", "soft_light", "difference"]), + ], + outputs=[ + io.Image.Output(), + ], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image1": ("IMAGE",), - "image2": ("IMAGE",), - "blend_factor": ("FLOAT", { - "default": 0.5, - "min": 0.0, - "max": 1.0, - "step": 0.01 - }), - "blend_mode": (["normal", "multiply", "screen", "overlay", "soft_light", "difference"],), - }, - } - - RETURN_TYPES = ("IMAGE",) - FUNCTION = "blend_images" - - CATEGORY = "image/postprocessing" - - def blend_images(self, image1: torch.Tensor, image2: torch.Tensor, blend_factor: float, blend_mode: str): + def execute(cls, image1: torch.Tensor, image2: torch.Tensor, blend_factor: float, blend_mode: str) -> io.NodeOutput: image1, image2 = node_helpers.image_alpha_fix(image1, image2) image2 = image2.to(image1.device) if image1.shape != image2.shape: @@ -41,12 +36,13 @@ class Blend: image2 = comfy.utils.common_upscale(image2, image1.shape[2], image1.shape[1], upscale_method='bicubic', crop='center') image2 = image2.permute(0, 2, 3, 1) - blended_image = self.blend_mode(image1, image2, blend_mode) + blended_image = cls.blend_mode(image1, image2, blend_mode) blended_image = image1 * (1 - blend_factor) + blended_image * blend_factor blended_image = torch.clamp(blended_image, 0, 1) - return (blended_image,) + return io.NodeOutput(blended_image) - def blend_mode(self, img1, img2, mode): + @classmethod + def blend_mode(cls, img1, img2, mode): if mode == "normal": return img2 elif mode == "multiply": @@ -56,13 +52,13 @@ class Blend: elif mode == "overlay": return torch.where(img1 <= 0.5, 2 * img1 * img2, 1 - 2 * (1 - img1) * (1 - img2)) elif mode == "soft_light": - return torch.where(img2 <= 0.5, img1 - (1 - 2 * img2) * img1 * (1 - img1), img1 + (2 * img2 - 1) * (self.g(img1) - img1)) + return torch.where(img2 <= 0.5, img1 - (1 - 2 * img2) * img1 * (1 - img1), img1 + (2 * img2 - 1) * (cls.g(img1) - img1)) elif mode == "difference": return img1 - img2 - else: - raise ValueError(f"Unsupported blend mode: {mode}") + raise ValueError(f"Unsupported blend mode: {mode}") - def g(self, x): + @classmethod + def g(cls, x): return torch.where(x <= 0.25, ((16 * x - 12) * x + 4) * x, torch.sqrt(x)) def gaussian_kernel(kernel_size: int, sigma: float, device=None): @@ -71,38 +67,26 @@ def gaussian_kernel(kernel_size: int, sigma: float, device=None): g = torch.exp(-(d * d) / (2.0 * sigma * sigma)) return g / g.sum() -class Blur: - def __init__(self): - pass +class Blur(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="ImageBlur", + category="image/postprocessing", + inputs=[ + io.Image.Input("image"), + io.Int.Input("blur_radius", default=1, min=1, max=31, step=1), + io.Float.Input("sigma", default=1.0, min=0.1, max=10.0, step=0.1), + ], + outputs=[ + io.Image.Output(), + ], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - "blur_radius": ("INT", { - "default": 1, - "min": 1, - "max": 31, - "step": 1 - }), - "sigma": ("FLOAT", { - "default": 1.0, - "min": 0.1, - "max": 10.0, - "step": 0.1 - }), - }, - } - - RETURN_TYPES = ("IMAGE",) - FUNCTION = "blur" - - CATEGORY = "image/postprocessing" - - def blur(self, image: torch.Tensor, blur_radius: int, sigma: float): + def execute(cls, image: torch.Tensor, blur_radius: int, sigma: float) -> io.NodeOutput: if blur_radius == 0: - return (image,) + return io.NodeOutput(image) image = image.to(comfy.model_management.get_torch_device()) batch_size, height, width, channels = image.shape @@ -115,31 +99,24 @@ class Blur: blurred = F.conv2d(padded_image, kernel, padding=kernel_size // 2, groups=channels)[:,:,blur_radius:-blur_radius, blur_radius:-blur_radius] blurred = blurred.permute(0, 2, 3, 1) - return (blurred.to(comfy.model_management.intermediate_device()),) + return io.NodeOutput(blurred.to(comfy.model_management.intermediate_device())) -class Quantize: - def __init__(self): - pass +class Quantize(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - "colors": ("INT", { - "default": 256, - "min": 1, - "max": 256, - "step": 1 - }), - "dither": (["none", "floyd-steinberg", "bayer-2", "bayer-4", "bayer-8", "bayer-16"],), - }, - } - - RETURN_TYPES = ("IMAGE",) - FUNCTION = "quantize" - - CATEGORY = "image/postprocessing" + def define_schema(cls): + return io.Schema( + node_id="ImageQuantize", + category="image/postprocessing", + inputs=[ + io.Image.Input("image"), + io.Int.Input("colors", default=256, min=1, max=256, step=1), + io.Combo.Input("dither", options=["none", "floyd-steinberg", "bayer-2", "bayer-4", "bayer-8", "bayer-16"]), + ], + outputs=[ + io.Image.Output(), + ], + ) @staticmethod def bayer(im, pal_im, order): @@ -167,7 +144,8 @@ class Quantize: im = im.quantize(palette=pal_im, dither=Image.Dither.NONE) return im - def quantize(self, image: torch.Tensor, colors: int, dither: str): + @classmethod + def execute(cls, image: torch.Tensor, colors: int, dither: str) -> io.NodeOutput: batch_size, height, width, _ = image.shape result = torch.zeros_like(image) @@ -187,46 +165,29 @@ class Quantize: quantized_array = torch.tensor(np.array(quantized_image.convert("RGB"))).float() / 255 result[b] = quantized_array - return (result,) + return io.NodeOutput(result) -class Sharpen: - def __init__(self): - pass +class Sharpen(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="ImageSharpen", + category="image/postprocessing", + inputs=[ + io.Image.Input("image"), + io.Int.Input("sharpen_radius", default=1, min=1, max=31, step=1), + io.Float.Input("sigma", default=1.0, min=0.1, max=10.0, step=0.01), + io.Float.Input("alpha", default=1.0, min=0.0, max=5.0, step=0.01), + ], + outputs=[ + io.Image.Output(), + ], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - "sharpen_radius": ("INT", { - "default": 1, - "min": 1, - "max": 31, - "step": 1 - }), - "sigma": ("FLOAT", { - "default": 1.0, - "min": 0.1, - "max": 10.0, - "step": 0.01 - }), - "alpha": ("FLOAT", { - "default": 1.0, - "min": 0.0, - "max": 5.0, - "step": 0.01 - }), - }, - } - - RETURN_TYPES = ("IMAGE",) - FUNCTION = "sharpen" - - CATEGORY = "image/postprocessing" - - def sharpen(self, image: torch.Tensor, sharpen_radius: int, sigma:float, alpha: float): + def execute(cls, image: torch.Tensor, sharpen_radius: int, sigma:float, alpha: float) -> io.NodeOutput: if sharpen_radius == 0: - return (image,) + return io.NodeOutput(image) batch_size, height, width, channels = image.shape image = image.to(comfy.model_management.get_torch_device()) @@ -245,23 +206,29 @@ class Sharpen: result = torch.clamp(sharpened, 0, 1) - return (result.to(comfy.model_management.intermediate_device()),) + return io.NodeOutput(result.to(comfy.model_management.intermediate_device())) -class ImageScaleToTotalPixels: +class ImageScaleToTotalPixels(io.ComfyNode): upscale_methods = ["nearest-exact", "bilinear", "area", "bicubic", "lanczos"] crop_methods = ["disabled", "center"] @classmethod - def INPUT_TYPES(s): - return {"required": { "image": ("IMAGE",), "upscale_method": (s.upscale_methods,), - "megapixels": ("FLOAT", {"default": 1.0, "min": 0.01, "max": 16.0, "step": 0.01}), - }} - RETURN_TYPES = ("IMAGE",) - FUNCTION = "upscale" + def define_schema(cls): + return io.Schema( + node_id="ImageScaleToTotalPixels", + category="image/upscaling", + inputs=[ + io.Image.Input("image"), + io.Combo.Input("upscale_method", options=cls.upscale_methods), + io.Float.Input("megapixels", default=1.0, min=0.01, max=16.0, step=0.01), + ], + outputs=[ + io.Image.Output(), + ], + ) - CATEGORY = "image/upscaling" - - def upscale(self, image, upscale_method, megapixels): + @classmethod + def execute(cls, image, upscale_method, megapixels) -> io.NodeOutput: samples = image.movedim(-1,1) total = int(megapixels * 1024 * 1024) @@ -271,12 +238,18 @@ class ImageScaleToTotalPixels: s = comfy.utils.common_upscale(samples, width, height, upscale_method, "disabled") s = s.movedim(1,-1) - return (s,) + return io.NodeOutput(s) -NODE_CLASS_MAPPINGS = { - "ImageBlend": Blend, - "ImageBlur": Blur, - "ImageQuantize": Quantize, - "ImageSharpen": Sharpen, - "ImageScaleToTotalPixels": ImageScaleToTotalPixels, -} +class PostProcessingExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + Blend, + Blur, + Quantize, + Sharpen, + ImageScaleToTotalPixels, + ] + +async def comfy_entrypoint() -> PostProcessingExtension: + return PostProcessingExtension() From cd66d72b464fd9d344baa426b50a5f0e5e512f99 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 00:15:44 +0300 Subject: [PATCH 329/759] convert CLIPTextEncodeSDXL nodes to V3 schema (#9716) --- comfy_extras/nodes_clip_sdxl.py | 93 +++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 38 deletions(-) diff --git a/comfy_extras/nodes_clip_sdxl.py b/comfy_extras/nodes_clip_sdxl.py index 14269caf3..520ff0e3c 100644 --- a/comfy_extras/nodes_clip_sdxl.py +++ b/comfy_extras/nodes_clip_sdxl.py @@ -1,43 +1,52 @@ -from nodes import MAX_RESOLUTION +from typing_extensions import override -class CLIPTextEncodeSDXLRefiner: +import nodes +from comfy_api.latest import ComfyExtension, io + + +class CLIPTextEncodeSDXLRefiner(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "ascore": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 1000.0, "step": 0.01}), - "width": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - "height": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - "text": ("STRING", {"multiline": True, "dynamicPrompts": True}), "clip": ("CLIP", ), - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodeSDXLRefiner", + category="advanced/conditioning", + inputs=[ + io.Float.Input("ascore", default=6.0, min=0.0, max=1000.0, step=0.01), + io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION), + io.Int.Input("height", default=1024, min=0, max=nodes.MAX_RESOLUTION), + io.String.Input("text", multiline=True, dynamic_prompts=True), + io.Clip.Input("clip"), + ], + outputs=[io.Conditioning.Output()], + ) - CATEGORY = "advanced/conditioning" - - def encode(self, clip, ascore, width, height, text): + @classmethod + def execute(cls, clip, ascore, width, height, text) -> io.NodeOutput: tokens = clip.tokenize(text) - return (clip.encode_from_tokens_scheduled(tokens, add_dict={"aesthetic_score": ascore, "width": width, "height": height}), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens, add_dict={"aesthetic_score": ascore, "width": width, "height": height})) -class CLIPTextEncodeSDXL: +class CLIPTextEncodeSDXL(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "width": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - "height": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - "crop_w": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION}), - "crop_h": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION}), - "target_width": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - "target_height": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - "text_g": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "text_l": ("STRING", {"multiline": True, "dynamicPrompts": True}), - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodeSDXL", + category="advanced/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION), + io.Int.Input("height", default=1024, min=0, max=nodes.MAX_RESOLUTION), + io.Int.Input("crop_w", default=0, min=0, max=nodes.MAX_RESOLUTION), + io.Int.Input("crop_h", default=0, min=0, max=nodes.MAX_RESOLUTION), + io.Int.Input("target_width", default=1024, min=0, max=nodes.MAX_RESOLUTION), + io.Int.Input("target_height", default=1024, min=0, max=nodes.MAX_RESOLUTION), + io.String.Input("text_g", multiline=True, dynamic_prompts=True), + io.String.Input("text_l", multiline=True, dynamic_prompts=True), + ], + outputs=[io.Conditioning.Output()], + ) - CATEGORY = "advanced/conditioning" - - def encode(self, clip, width, height, crop_w, crop_h, target_width, target_height, text_g, text_l): + @classmethod + def execute(cls, clip, width, height, crop_w, crop_h, target_width, target_height, text_g, text_l) -> io.NodeOutput: tokens = clip.tokenize(text_g) tokens["l"] = clip.tokenize(text_l)["l"] if len(tokens["l"]) != len(tokens["g"]): @@ -46,9 +55,17 @@ class CLIPTextEncodeSDXL: tokens["l"] += empty["l"] while len(tokens["l"]) > len(tokens["g"]): tokens["g"] += empty["g"] - return (clip.encode_from_tokens_scheduled(tokens, add_dict={"width": width, "height": height, "crop_w": crop_w, "crop_h": crop_h, "target_width": target_width, "target_height": target_height}), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens, add_dict={"width": width, "height": height, "crop_w": crop_w, "crop_h": crop_h, "target_width": target_width, "target_height": target_height})) -NODE_CLASS_MAPPINGS = { - "CLIPTextEncodeSDXLRefiner": CLIPTextEncodeSDXLRefiner, - "CLIPTextEncodeSDXL": CLIPTextEncodeSDXL, -} + +class ClipSdxlExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + CLIPTextEncodeSDXLRefiner, + CLIPTextEncodeSDXL, + ] + + +async def comfy_entrypoint() -> ClipSdxlExtension: + return ClipSdxlExtension() From 1e098d61327e1c02c1a47b2626514474aa8e3c7e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 26 Sep 2025 15:34:17 -0700 Subject: [PATCH 330/759] Don't add template to qwen2.5vl when template is in prompt. (#10043) Make the hunyuan image refiner template_end 36. --- comfy/text_encoders/hunyuan_image.py | 8 ++++- comfy/text_encoders/qwen_image.py | 46 +++++++++++++++++----------- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/comfy/text_encoders/hunyuan_image.py b/comfy/text_encoders/hunyuan_image.py index 699eddc33..ff04726e1 100644 --- a/comfy/text_encoders/hunyuan_image.py +++ b/comfy/text_encoders/hunyuan_image.py @@ -63,7 +63,13 @@ class HunyuanImageTEModel(QwenImageTEModel): self.byt5_small = None def encode_token_weights(self, token_weight_pairs): - cond, p, extra = super().encode_token_weights(token_weight_pairs) + tok_pairs = token_weight_pairs["qwen25_7b"][0] + template_end = -1 + if tok_pairs[0][0] == 27: + if len(tok_pairs) > 36: # refiner prompt uses a fixed 36 template_end + template_end = 36 + + cond, p, extra = super().encode_token_weights(token_weight_pairs, template_end=template_end) if self.byt5_small is not None and "byt5" in token_weight_pairs: out = self.byt5_small.encode_token_weights(token_weight_pairs["byt5"]) extra["conditioning_byt5small"] = out[0] diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py index 6646b1003..40fa67937 100644 --- a/comfy/text_encoders/qwen_image.py +++ b/comfy/text_encoders/qwen_image.py @@ -18,13 +18,22 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer): self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs): - if llama_template is None: - if len(images) > 0: - llama_text = self.llama_template_images.format(text) - else: - llama_text = self.llama_template.format(text) + skip_template = False + if text.startswith('<|im_start|>'): + skip_template = True + if text.startswith('<|start_header_id|>'): + skip_template = True + + if skip_template: + llama_text = text else: - llama_text = llama_template.format(text) + if llama_template is None: + if len(images) > 0: + llama_text = self.llama_template_images.format(text) + else: + llama_text = self.llama_template.format(text) + else: + llama_text = llama_template.format(text) tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs) key_name = next(iter(tokens)) embed_count = 0 @@ -47,22 +56,23 @@ class QwenImageTEModel(sd1_clip.SD1ClipModel): def __init__(self, device="cpu", dtype=None, model_options={}): super().__init__(device=device, dtype=dtype, name="qwen25_7b", clip_model=Qwen25_7BVLIModel, model_options=model_options) - def encode_token_weights(self, token_weight_pairs): + def encode_token_weights(self, token_weight_pairs, template_end=-1): out, pooled, extra = super().encode_token_weights(token_weight_pairs) tok_pairs = token_weight_pairs["qwen25_7b"][0] count_im_start = 0 - for i, v in enumerate(tok_pairs): - elem = v[0] - if not torch.is_tensor(elem): - if isinstance(elem, numbers.Integral): - if elem == 151644 and count_im_start < 2: - template_end = i - count_im_start += 1 + if template_end == -1: + for i, v in enumerate(tok_pairs): + elem = v[0] + if not torch.is_tensor(elem): + if isinstance(elem, numbers.Integral): + if elem == 151644 and count_im_start < 2: + template_end = i + count_im_start += 1 - if out.shape[1] > (template_end + 3): - if tok_pairs[template_end + 1][0] == 872: - if tok_pairs[template_end + 2][0] == 198: - template_end += 3 + if out.shape[1] > (template_end + 3): + if tok_pairs[template_end + 1][0] == 872: + if tok_pairs[template_end + 2][0] == 198: + template_end += 3 out = out[:, template_end:] From 196954ab8c55bc4ac48113686a57ce250677c7b5 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Fri, 26 Sep 2025 19:55:03 -0700 Subject: [PATCH 331/759] Add 'input_cond' and 'input_uncond' to the args dictionary passed into sampler_cfg_function (#10044) --- comfy/samplers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/samplers.py b/comfy/samplers.py index b3202cec6..c59e296a1 100755 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -360,7 +360,7 @@ def calc_cond_uncond_batch(model, cond, uncond, x_in, timestep, model_options): def cfg_function(model, cond_pred, uncond_pred, cond_scale, x, timestep, model_options={}, cond=None, uncond=None): if "sampler_cfg_function" in model_options: args = {"cond": x - cond_pred, "uncond": x - uncond_pred, "cond_scale": cond_scale, "timestep": timestep, "input": x, "sigma": timestep, - "cond_denoised": cond_pred, "uncond_denoised": uncond_pred, "model": model, "model_options": model_options} + "cond_denoised": cond_pred, "uncond_denoised": uncond_pred, "model": model, "model_options": model_options, "input_cond": cond, "input_uncond": uncond} cfg_result = x - model_options["sampler_cfg_function"](args) else: cfg_result = uncond_pred + (cond_pred - uncond_pred) * cond_scale @@ -390,7 +390,7 @@ def sampling_function(model, x, timestep, uncond, cond, cond_scale, model_option for fn in model_options.get("sampler_pre_cfg_function", []): args = {"conds":conds, "conds_out": out, "cond_scale": cond_scale, "timestep": timestep, "input": x, "sigma": timestep, "model": model, "model_options": model_options} - out = fn(args) + out = fn(args) return cfg_function(model, out[0], out[1], cond_scale, x, timestep, model_options=model_options, cond=cond, uncond=uncond_) From 0572029fee48741a8cf34a8e4d485898c5ab5dfd Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sat, 27 Sep 2025 12:18:16 +0800 Subject: [PATCH 332/759] Update template to 0.1.88 (#10046) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2980bebdd..b3f81e8fa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.26.13 -comfyui-workflow-templates==0.1.86 +comfyui-workflow-templates==0.1.88 comfyui-embedded-docs==0.2.6 torch torchsde From 255572188f79e5c58fa997bf73529021129459a9 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Fri, 26 Sep 2025 21:29:13 -0700 Subject: [PATCH 333/759] Add workflow templates version tracking to system_stats (#9089) Adds installed and required workflow templates version information to the /system_stats endpoint, allowing the frontend to detect and notify users when their templates package is outdated. - Add get_installed_templates_version() and get_required_templates_version() methods to FrontendManager - Include templates version info in system_stats response - Add comprehensive unit tests for the new functionality --- app/frontend_management.py | 33 +++++++++ server.py | 4 ++ tests-unit/app_test/frontend_manager_test.py | 71 ++++++++++++++++++++ 3 files changed, 108 insertions(+) diff --git a/app/frontend_management.py b/app/frontend_management.py index 0bee73685..cce0c117d 100644 --- a/app/frontend_management.py +++ b/app/frontend_management.py @@ -42,6 +42,7 @@ def get_installed_frontend_version(): frontend_version_str = version("comfyui-frontend-package") return frontend_version_str + def get_required_frontend_version(): """Get the required frontend version from requirements.txt.""" try: @@ -63,6 +64,7 @@ def get_required_frontend_version(): logging.error(f"Error reading requirements.txt: {e}") return None + def check_frontend_version(): """Check if the frontend version is up to date.""" @@ -203,6 +205,37 @@ class FrontendManager: """Get the required frontend package version.""" return get_required_frontend_version() + @classmethod + def get_installed_templates_version(cls) -> str: + """Get the currently installed workflow templates package version.""" + try: + templates_version_str = version("comfyui-workflow-templates") + return templates_version_str + except Exception: + return None + + @classmethod + def get_required_templates_version(cls) -> str: + """Get the required workflow templates version from requirements.txt.""" + try: + with open(requirements_path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line.startswith("comfyui-workflow-templates=="): + version_str = line.split("==")[-1] + if not is_valid_version(version_str): + logging.error(f"Invalid templates version format in requirements.txt: {version_str}") + return None + return version_str + logging.error("comfyui-workflow-templates not found in requirements.txt") + return None + except FileNotFoundError: + logging.error("requirements.txt not found. Cannot determine required templates version.") + return None + except Exception as e: + logging.error(f"Error reading requirements.txt: {e}") + return None + @classmethod def default_frontend_path(cls) -> str: try: diff --git a/server.py b/server.py index 603677397..80e9d3fa7 100644 --- a/server.py +++ b/server.py @@ -550,6 +550,8 @@ class PromptServer(): vram_total, torch_vram_total = comfy.model_management.get_total_memory(device, torch_total_too=True) vram_free, torch_vram_free = comfy.model_management.get_free_memory(device, torch_free_too=True) required_frontend_version = FrontendManager.get_required_frontend_version() + installed_templates_version = FrontendManager.get_installed_templates_version() + required_templates_version = FrontendManager.get_required_templates_version() system_stats = { "system": { @@ -558,6 +560,8 @@ class PromptServer(): "ram_free": ram_free, "comfyui_version": __version__, "required_frontend_version": required_frontend_version, + "installed_templates_version": installed_templates_version, + "required_templates_version": required_templates_version, "python_version": sys.version, "pytorch_version": comfy.model_management.torch_version, "embedded_python": os.path.split(os.path.split(sys.executable)[0])[1] == "python_embeded", diff --git a/tests-unit/app_test/frontend_manager_test.py b/tests-unit/app_test/frontend_manager_test.py index ce43ac564..643f04e72 100644 --- a/tests-unit/app_test/frontend_manager_test.py +++ b/tests-unit/app_test/frontend_manager_test.py @@ -205,3 +205,74 @@ numpy""" # Assert assert version is None + + +def test_get_templates_version(): + # Arrange + expected_version = "0.1.41" + mock_requirements_content = """torch +torchsde +comfyui-frontend-package==1.25.0 +comfyui-workflow-templates==0.1.41 +other-package==1.0.0 +numpy""" + + # Act + with patch("builtins.open", mock_open(read_data=mock_requirements_content)): + version = FrontendManager.get_required_templates_version() + + # Assert + assert version == expected_version + + +def test_get_templates_version_not_found(): + # Arrange + mock_requirements_content = """torch +torchsde +comfyui-frontend-package==1.25.0 +other-package==1.0.0 +numpy""" + + # Act + with patch("builtins.open", mock_open(read_data=mock_requirements_content)): + version = FrontendManager.get_required_templates_version() + + # Assert + assert version is None + + +def test_get_templates_version_invalid_semver(): + # Arrange + mock_requirements_content = """torch +torchsde +comfyui-workflow-templates==1.0.0.beta +other-package==1.0.0 +numpy""" + + # Act + with patch("builtins.open", mock_open(read_data=mock_requirements_content)): + version = FrontendManager.get_required_templates_version() + + # Assert + assert version is None + + +def test_get_installed_templates_version(): + # Arrange + expected_version = "0.1.40" + + # Act + with patch("app.frontend_management.version", return_value=expected_version): + version = FrontendManager.get_installed_templates_version() + + # Assert + assert version == expected_version + + +def test_get_installed_templates_version_not_installed(): + # Act + with patch("app.frontend_management.version", side_effect=Exception("Package not found")): + version = FrontendManager.get_installed_templates_version() + + # Assert + assert version is None From a9cf1cd249773632949bec2262f921f64378127f Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 09:13:05 +0300 Subject: [PATCH 334/759] convert nodes_hidream.py to V3 schema (#9946) --- comfy_extras/nodes_hidream.py | 88 +++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/comfy_extras/nodes_hidream.py b/comfy_extras/nodes_hidream.py index dfb98597b..eee683ee1 100644 --- a/comfy_extras/nodes_hidream.py +++ b/comfy_extras/nodes_hidream.py @@ -1,55 +1,73 @@ +from typing_extensions import override + import folder_paths import comfy.sd import comfy.model_management +from comfy_api.latest import ComfyExtension, io -class QuadrupleCLIPLoader: +class QuadrupleCLIPLoader(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ), - "clip_name2": (folder_paths.get_filename_list("text_encoders"), ), - "clip_name3": (folder_paths.get_filename_list("text_encoders"), ), - "clip_name4": (folder_paths.get_filename_list("text_encoders"), ) - }} - RETURN_TYPES = ("CLIP",) - FUNCTION = "load_clip" + def define_schema(cls): + return io.Schema( + node_id="QuadrupleCLIPLoader", + category="advanced/loaders", + description="[Recipes]\n\nhidream: long clip-l, long clip-g, t5xxl, llama_8b_3.1_instruct", + inputs=[ + io.Combo.Input("clip_name1", options=folder_paths.get_filename_list("text_encoders")), + io.Combo.Input("clip_name2", options=folder_paths.get_filename_list("text_encoders")), + io.Combo.Input("clip_name3", options=folder_paths.get_filename_list("text_encoders")), + io.Combo.Input("clip_name4", options=folder_paths.get_filename_list("text_encoders")), + ], + outputs=[ + io.Clip.Output(), + ] + ) - CATEGORY = "advanced/loaders" - - DESCRIPTION = "[Recipes]\n\nhidream: long clip-l, long clip-g, t5xxl, llama_8b_3.1_instruct" - - def load_clip(self, clip_name1, clip_name2, clip_name3, clip_name4): + @classmethod + def execute(cls, clip_name1, clip_name2, clip_name3, clip_name4): clip_path1 = folder_paths.get_full_path_or_raise("text_encoders", clip_name1) clip_path2 = folder_paths.get_full_path_or_raise("text_encoders", clip_name2) clip_path3 = folder_paths.get_full_path_or_raise("text_encoders", clip_name3) clip_path4 = folder_paths.get_full_path_or_raise("text_encoders", clip_name4) clip = comfy.sd.load_clip(ckpt_paths=[clip_path1, clip_path2, clip_path3, clip_path4], embedding_directory=folder_paths.get_folder_paths("embeddings")) - return (clip,) + return io.NodeOutput(clip) -class CLIPTextEncodeHiDream: +class CLIPTextEncodeHiDream(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "clip_l": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "clip_g": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "t5xxl": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "llama": ("STRING", {"multiline": True, "dynamicPrompts": True}) - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" - - CATEGORY = "advanced/conditioning" - - def encode(self, clip, clip_l, clip_g, t5xxl, llama): + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodeHiDream", + category="advanced/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("clip_l", multiline=True, dynamic_prompts=True), + io.String.Input("clip_g", multiline=True, dynamic_prompts=True), + io.String.Input("t5xxl", multiline=True, dynamic_prompts=True), + io.String.Input("llama", multiline=True, dynamic_prompts=True), + ], + outputs=[ + io.Conditioning.Output(), + ] + ) + @classmethod + def execute(cls, clip, clip_l, clip_g, t5xxl, llama): tokens = clip.tokenize(clip_g) tokens["l"] = clip.tokenize(clip_l)["l"] tokens["t5xxl"] = clip.tokenize(t5xxl)["t5xxl"] tokens["llama"] = clip.tokenize(llama)["llama"] - return (clip.encode_from_tokens_scheduled(tokens), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens)) -NODE_CLASS_MAPPINGS = { - "QuadrupleCLIPLoader": QuadrupleCLIPLoader, - "CLIPTextEncodeHiDream": CLIPTextEncodeHiDream, -} + +class HiDreamExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + QuadrupleCLIPLoader, + CLIPTextEncodeHiDream, + ] + + +async def comfy_entrypoint() -> HiDreamExtension: + return HiDreamExtension() From 6b4b671ce7b6c412c2db9f9f83ff8e27dbcfd959 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 12:27:01 +0300 Subject: [PATCH 335/759] convert nodes_bfl.py to V3 schema (#10033) --- comfy_api_nodes/nodes_bfl.py | 1056 ++++++++++++++++------------------ 1 file changed, 489 insertions(+), 567 deletions(-) diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py index c09be8d5b..77914021d 100644 --- a/comfy_api_nodes/nodes_bfl.py +++ b/comfy_api_nodes/nodes_bfl.py @@ -2,7 +2,8 @@ import asyncio import io from inspect import cleandoc from typing import Union, Optional -from comfy.comfy_types.node_typing import IO, ComfyNodeABC +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io as comfy_io from comfy_api_nodes.apis.bfl_api import ( BFLStatus, BFLFluxExpandImageRequest, @@ -130,7 +131,7 @@ def convert_image_to_base64(image: torch.Tensor): return base64.b64encode(img_byte_arr.getvalue()).decode() -class FluxProUltraImageNode(ComfyNodeABC): +class FluxProUltraImageNode(comfy_io.ComfyNode): """ Generates images using Flux Pro 1.1 Ultra via api based on prompt and resolution. """ @@ -141,71 +142,67 @@ class FluxProUltraImageNode(ComfyNodeABC): MAXIMUM_RATIO_STR = "4:1" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="FluxProUltraImageNode", + display_name="Flux 1.1 [pro] Ultra Image", + category="api node/image/BFL", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, + comfy_io.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", ), - "aspect_ratio": ( - IO.STRING, - { - "default": "16:9", - "tooltip": "Aspect ratio of image; must be between 1:4 and 4:1.", - }, + comfy_io.String.Input( + "aspect_ratio", + default="16:9", + tooltip="Aspect ratio of image; must be between 1:4 and 4:1.", ), - "raw": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "When True, generate less processed, more natural-looking images.", - }, + comfy_io.Boolean.Input( + "raw", + default=False, + tooltip="When True, generate less processed, more natural-looking images.", ), - }, - "optional": { - "image_prompt": (IO.IMAGE,), - "image_prompt_strength": ( - IO.FLOAT, - { - "default": 0.1, - "min": 0.0, - "max": 1.0, - "step": 0.01, - "tooltip": "Blend between the prompt and the image prompt.", - }, + comfy_io.Image.Input( + "image_prompt", + optional=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + comfy_io.Float.Input( + "image_prompt_strength", + default=0.1, + min=0.0, + max=1.0, + step=0.01, + tooltip="Blend between the prompt and the image prompt.", + optional=True, + ), + ], + outputs=[comfy_io.Image.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def VALIDATE_INPUTS(cls, aspect_ratio: str): + def validate_inputs(cls, aspect_ratio: str): try: validate_aspect_ratio( aspect_ratio, @@ -218,14 +215,9 @@ class FluxProUltraImageNode(ComfyNodeABC): return str(e) return True - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, aspect_ratio: str, prompt_upsampling=False, @@ -233,9 +225,7 @@ class FluxProUltraImageNode(ComfyNodeABC): seed=0, image_prompt=None, image_prompt_strength=0.1, - unique_id: Union[str, None] = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: if image_prompt is None: validate_string(prompt, strip_whitespace=False) operation = SynchronousOperation( @@ -251,10 +241,10 @@ class FluxProUltraImageNode(ComfyNodeABC): seed=seed, aspect_ratio=validate_aspect_ratio( aspect_ratio, - minimum_ratio=self.MINIMUM_RATIO, - maximum_ratio=self.MAXIMUM_RATIO, - minimum_ratio_str=self.MINIMUM_RATIO_STR, - maximum_ratio_str=self.MAXIMUM_RATIO_STR, + minimum_ratio=cls.MINIMUM_RATIO, + maximum_ratio=cls.MAXIMUM_RATIO, + minimum_ratio_str=cls.MINIMUM_RATIO_STR, + maximum_ratio_str=cls.MAXIMUM_RATIO_STR, ), raw=raw, image_prompt=( @@ -266,13 +256,16 @@ class FluxProUltraImageNode(ComfyNodeABC): None if image_prompt is None else round(image_prompt_strength, 2) ), ), - auth_kwargs=kwargs, + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) + output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) + return comfy_io.NodeOutput(output_image) -class FluxKontextProImageNode(ComfyNodeABC): +class FluxKontextProImageNode(comfy_io.ComfyNode): """ Edits images using Flux.1 Kontext [pro] via api based on prompt and aspect ratio. """ @@ -283,81 +276,73 @@ class FluxKontextProImageNode(ComfyNodeABC): MAXIMUM_RATIO_STR = "4:1" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation - specify what and how to edit.", - }, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id=cls.NODE_ID, + display_name=cls.DISPLAY_NAME, + category="api node/image/BFL", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation - specify what and how to edit.", ), - "aspect_ratio": ( - IO.STRING, - { - "default": "16:9", - "tooltip": "Aspect ratio of image; must be between 1:4 and 4:1.", - }, + comfy_io.String.Input( + "aspect_ratio", + default="16:9", + tooltip="Aspect ratio of image; must be between 1:4 and 4:1.", ), - "guidance": ( - IO.FLOAT, - { - "default": 3.0, - "min": 0.1, - "max": 99.0, - "step": 0.1, - "tooltip": "Guidance strength for the image generation process" - }, + comfy_io.Float.Input( + "guidance", + default=3.0, + min=0.1, + max=99.0, + step=0.1, + tooltip="Guidance strength for the image generation process", ), - "steps": ( - IO.INT, - { - "default": 50, - "min": 1, - "max": 150, - "tooltip": "Number of steps for the image generation process" - }, + comfy_io.Int.Input( + "steps", + default=50, + min=1, + max=150, + tooltip="Number of steps for the image generation process", ), - "seed": ( - IO.INT, - { - "default": 1234, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + comfy_io.Int.Input( + "seed", + default=1234, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, + comfy_io.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - }, - "optional": { - "input_image": (IO.IMAGE,), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" + comfy_io.Image.Input( + "input_image", + optional=True, + ), + ], + outputs=[comfy_io.Image.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) BFL_PATH = "/proxy/bfl/flux-kontext-pro/generate" + NODE_ID = "FluxKontextProImageNode" + DISPLAY_NAME = "Flux.1 Kontext [pro] Image" - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, aspect_ratio: str, guidance: float, @@ -365,21 +350,19 @@ class FluxKontextProImageNode(ComfyNodeABC): input_image: Optional[torch.Tensor]=None, seed=0, prompt_upsampling=False, - unique_id: Union[str, None] = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: aspect_ratio = validate_aspect_ratio( aspect_ratio, - minimum_ratio=self.MINIMUM_RATIO, - maximum_ratio=self.MAXIMUM_RATIO, - minimum_ratio_str=self.MINIMUM_RATIO_STR, - maximum_ratio_str=self.MAXIMUM_RATIO_STR, + minimum_ratio=cls.MINIMUM_RATIO, + maximum_ratio=cls.MAXIMUM_RATIO, + minimum_ratio_str=cls.MINIMUM_RATIO_STR, + maximum_ratio_str=cls.MAXIMUM_RATIO_STR, ) if input_image is None: validate_string(prompt, strip_whitespace=False) operation = SynchronousOperation( endpoint=ApiEndpoint( - path=self.BFL_PATH, + path=cls.BFL_PATH, method=HttpMethod.POST, request_model=BFLFluxKontextProGenerateRequest, response_model=BFLFluxProGenerateResponse, @@ -397,10 +380,13 @@ class FluxKontextProImageNode(ComfyNodeABC): else convert_image_to_base64(input_image) ) ), - auth_kwargs=kwargs, + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) + output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) + return comfy_io.NodeOutput(output_image) class FluxKontextMaxImageNode(FluxKontextProImageNode): @@ -410,63 +396,60 @@ class FluxKontextMaxImageNode(FluxKontextProImageNode): DESCRIPTION = cleandoc(__doc__ or "") BFL_PATH = "/proxy/bfl/flux-kontext-max/generate" + NODE_ID = "FluxKontextMaxImageNode" + DISPLAY_NAME = "Flux.1 Kontext [max] Image" -class FluxProImageNode(ComfyNodeABC): +class FluxProImageNode(comfy_io.ComfyNode): """ Generates images synchronously based on prompt and resolution. """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="FluxProImageNode", + display_name="Flux 1.1 [pro] Image", + category="api node/image/BFL", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, + comfy_io.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - "width": ( - IO.INT, - { - "default": 1024, - "min": 256, - "max": 1440, - "step": 32, - }, + comfy_io.Int.Input( + "width", + default=1024, + min=256, + max=1440, + step=32, ), - "height": ( - IO.INT, - { - "default": 768, - "min": 256, - "max": 1440, - "step": 32, - }, + comfy_io.Int.Input( + "height", + default=768, + min=256, + max=1440, + step=32, ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + ), + comfy_io.Image.Input( + "image_prompt", + optional=True, ), - }, - "optional": { - "image_prompt": (IO.IMAGE,), # "image_prompt_strength": ( # IO.FLOAT, # { @@ -477,22 +460,19 @@ class FluxProImageNode(ComfyNodeABC): # "tooltip": "Blend between the prompt and the image prompt.", # }, # ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[comfy_io.Image.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, prompt_upsampling, width: int, @@ -500,9 +480,7 @@ class FluxProImageNode(ComfyNodeABC): seed=0, image_prompt=None, # image_prompt_strength=0.1, - unique_id: Union[str, None] = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: image_prompt = ( image_prompt if image_prompt is None @@ -524,118 +502,103 @@ class FluxProImageNode(ComfyNodeABC): seed=seed, image_prompt=image_prompt, ), - auth_kwargs=kwargs, + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) + output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) + return comfy_io.NodeOutput(output_image) -class FluxProExpandNode(ComfyNodeABC): +class FluxProExpandNode(comfy_io.ComfyNode): """ Outpaints image based on prompt. """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="FluxProExpandNode", + display_name="Flux.1 Expand Image", + category="api node/image/BFL", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("image"), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, + comfy_io.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - "top": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2048, - "tooltip": "Number of pixels to expand at the top of the image" - }, + comfy_io.Int.Input( + "top", + default=0, + min=0, + max=2048, + tooltip="Number of pixels to expand at the top of the image", ), - "bottom": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2048, - "tooltip": "Number of pixels to expand at the bottom of the image" - }, + comfy_io.Int.Input( + "bottom", + default=0, + min=0, + max=2048, + tooltip="Number of pixels to expand at the bottom of the image", ), - "left": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2048, - "tooltip": "Number of pixels to expand at the left side of the image" - }, + comfy_io.Int.Input( + "left", + default=0, + min=0, + max=2048, + tooltip="Number of pixels to expand at the left of the image", ), - "right": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2048, - "tooltip": "Number of pixels to expand at the right side of the image" - }, + comfy_io.Int.Input( + "right", + default=0, + min=0, + max=2048, + tooltip="Number of pixels to expand at the right of the image", ), - "guidance": ( - IO.FLOAT, - { - "default": 60, - "min": 1.5, - "max": 100, - "tooltip": "Guidance strength for the image generation process" - }, + comfy_io.Float.Input( + "guidance", + default=60, + min=1.5, + max=100, + tooltip="Guidance strength for the image generation process", ), - "steps": ( - IO.INT, - { - "default": 50, - "min": 15, - "max": 50, - "tooltip": "Number of steps for the image generation process" - }, + comfy_io.Int.Input( + "steps", + default=50, + min=15, + max=50, + tooltip="Number of steps for the image generation process", ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", ), - }, - "optional": {}, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[comfy_io.Image.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" - - async def api_call( - self, + @classmethod + async def execute( + cls, image: torch.Tensor, prompt: str, prompt_upsampling: bool, @@ -646,9 +609,7 @@ class FluxProExpandNode(ComfyNodeABC): steps: int, guidance: float, seed=0, - unique_id: Union[str, None] = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: image = convert_image_to_base64(image) operation = SynchronousOperation( @@ -670,84 +631,77 @@ class FluxProExpandNode(ComfyNodeABC): seed=seed, image=image, ), - auth_kwargs=kwargs, + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) + output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) + return comfy_io.NodeOutput(output_image) -class FluxProFillNode(ComfyNodeABC): +class FluxProFillNode(comfy_io.ComfyNode): """ Inpaints image based on mask and prompt. """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - "mask": (IO.MASK,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="FluxProFillNode", + display_name="Flux.1 Fill Image", + category="api node/image/BFL", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("image"), + comfy_io.Mask.Input("mask"), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, + comfy_io.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - "guidance": ( - IO.FLOAT, - { - "default": 60, - "min": 1.5, - "max": 100, - "tooltip": "Guidance strength for the image generation process" - }, + comfy_io.Float.Input( + "guidance", + default=60, + min=1.5, + max=100, + tooltip="Guidance strength for the image generation process", ), - "steps": ( - IO.INT, - { - "default": 50, - "min": 15, - "max": 50, - "tooltip": "Number of steps for the image generation process" - }, + comfy_io.Int.Input( + "steps", + default=50, + min=15, + max=50, + tooltip="Number of steps for the image generation process", ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", ), - }, - "optional": {}, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[comfy_io.Image.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" - - async def api_call( - self, + @classmethod + async def execute( + cls, image: torch.Tensor, mask: torch.Tensor, prompt: str, @@ -755,9 +709,7 @@ class FluxProFillNode(ComfyNodeABC): steps: int, guidance: float, seed=0, - unique_id: Union[str, None] = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: # prepare mask mask = resize_mask_to_image(mask, image) mask = convert_image_to_base64(convert_mask_to_image(mask)) @@ -780,109 +732,96 @@ class FluxProFillNode(ComfyNodeABC): image=image, mask=mask, ), - auth_kwargs=kwargs, + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) + output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) + return comfy_io.NodeOutput(output_image) -class FluxProCannyNode(ComfyNodeABC): +class FluxProCannyNode(comfy_io.ComfyNode): """ Generate image using a control image (canny). """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "control_image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="FluxProCannyNode", + display_name="Flux.1 Canny Control Image", + category="api node/image/BFL", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("control_image"), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, + comfy_io.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - "canny_low_threshold": ( - IO.FLOAT, - { - "default": 0.1, - "min": 0.01, - "max": 0.99, - "step": 0.01, - "tooltip": "Low threshold for Canny edge detection; ignored if skip_processing is True" - }, + comfy_io.Float.Input( + "canny_low_threshold", + default=0.1, + min=0.01, + max=0.99, + step=0.01, + tooltip="Low threshold for Canny edge detection; ignored if skip_processing is True", ), - "canny_high_threshold": ( - IO.FLOAT, - { - "default": 0.4, - "min": 0.01, - "max": 0.99, - "step": 0.01, - "tooltip": "High threshold for Canny edge detection; ignored if skip_processing is True" - }, + comfy_io.Float.Input( + "canny_high_threshold", + default=0.4, + min=0.01, + max=0.99, + step=0.01, + tooltip="High threshold for Canny edge detection; ignored if skip_processing is True", ), - "skip_preprocessing": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to skip preprocessing; set to True if control_image already is canny-fied, False if it is a raw image.", - }, + comfy_io.Boolean.Input( + "skip_preprocessing", + default=False, + tooltip="Whether to skip preprocessing; set to True if control_image already is canny-fied, False if it is a raw image.", ), - "guidance": ( - IO.FLOAT, - { - "default": 30, - "min": 1, - "max": 100, - "tooltip": "Guidance strength for the image generation process" - }, + comfy_io.Float.Input( + "guidance", + default=30, + min=1, + max=100, + tooltip="Guidance strength for the image generation process", ), - "steps": ( - IO.INT, - { - "default": 50, - "min": 15, - "max": 50, - "tooltip": "Number of steps for the image generation process" - }, + comfy_io.Int.Input( + "steps", + default=50, + min=15, + max=50, + tooltip="Number of steps for the image generation process", ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", ), - }, - "optional": {}, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[comfy_io.Image.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" - - async def api_call( - self, + @classmethod + async def execute( + cls, control_image: torch.Tensor, prompt: str, prompt_upsampling: bool, @@ -892,9 +831,7 @@ class FluxProCannyNode(ComfyNodeABC): steps: int, guidance: float, seed=0, - unique_id: Union[str, None] = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: control_image = convert_image_to_base64(control_image[:, :, :, :3]) preprocessed_image = None @@ -929,89 +866,80 @@ class FluxProCannyNode(ComfyNodeABC): canny_high_threshold=canny_high_threshold, preprocessed_image=preprocessed_image, ), - auth_kwargs=kwargs, + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) + output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) + return comfy_io.NodeOutput(output_image) -class FluxProDepthNode(ComfyNodeABC): +class FluxProDepthNode(comfy_io.ComfyNode): """ Generate image using a control image (depth). """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "control_image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="FluxProDepthNode", + display_name="Flux.1 Depth Control Image", + category="api node/image/BFL", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("control_image"), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, + comfy_io.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - "skip_preprocessing": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to skip preprocessing; set to True if control_image already is depth-ified, False if it is a raw image.", - }, + comfy_io.Boolean.Input( + "skip_preprocessing", + default=False, + tooltip="Whether to skip preprocessing; set to True if control_image already is depth-ified, False if it is a raw image.", ), - "guidance": ( - IO.FLOAT, - { - "default": 15, - "min": 1, - "max": 100, - "tooltip": "Guidance strength for the image generation process" - }, + comfy_io.Float.Input( + "guidance", + default=15, + min=1, + max=100, + tooltip="Guidance strength for the image generation process", ), - "steps": ( - IO.INT, - { - "default": 50, - "min": 15, - "max": 50, - "tooltip": "Number of steps for the image generation process" - }, + comfy_io.Int.Input( + "steps", + default=50, + min=15, + max=50, + tooltip="Number of steps for the image generation process", ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", ), - }, - "optional": {}, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[comfy_io.Image.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" - - async def api_call( - self, + @classmethod + async def execute( + cls, control_image: torch.Tensor, prompt: str, prompt_upsampling: bool, @@ -1019,9 +947,7 @@ class FluxProDepthNode(ComfyNodeABC): steps: int, guidance: float, seed=0, - unique_id: Union[str, None] = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: control_image = convert_image_to_base64(control_image[:,:,:,:3]) preprocessed_image = None @@ -1045,33 +971,29 @@ class FluxProDepthNode(ComfyNodeABC): control_image=control_image, preprocessed_image=preprocessed_image, ), - auth_kwargs=kwargs, + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) + output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) + return comfy_io.NodeOutput(output_image) -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "FluxProUltraImageNode": FluxProUltraImageNode, - # "FluxProImageNode": FluxProImageNode, - "FluxKontextProImageNode": FluxKontextProImageNode, - "FluxKontextMaxImageNode": FluxKontextMaxImageNode, - "FluxProExpandNode": FluxProExpandNode, - "FluxProFillNode": FluxProFillNode, - "FluxProCannyNode": FluxProCannyNode, - "FluxProDepthNode": FluxProDepthNode, -} +class BFLExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + FluxProUltraImageNode, + # FluxProImageNode, + FluxKontextProImageNode, + FluxKontextMaxImageNode, + FluxProExpandNode, + FluxProFillNode, + FluxProCannyNode, + FluxProDepthNode, + ] -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "FluxProUltraImageNode": "Flux 1.1 [pro] Ultra Image", - # "FluxProImageNode": "Flux 1.1 [pro] Image", - "FluxKontextProImageNode": "Flux.1 Kontext [pro] Image", - "FluxKontextMaxImageNode": "Flux.1 Kontext [max] Image", - "FluxProExpandNode": "Flux.1 Expand Image", - "FluxProFillNode": "Flux.1 Fill Image", - "FluxProCannyNode": "Flux.1 Canny Control Image", - "FluxProDepthNode": "Flux.1 Depth Control Image", -} + +async def comfy_entrypoint() -> BFLExtension: + return BFLExtension() From bcfd80dd79ccfa77a7da69380795fbb55b65b1ba Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 12:28:11 +0300 Subject: [PATCH 336/759] convert nodes_luma.py to V3 schema (#10030) --- comfy_api_nodes/nodes_luma.py | 774 +++++++++++++++++----------------- 1 file changed, 396 insertions(+), 378 deletions(-) diff --git a/comfy_api_nodes/nodes_luma.py b/comfy_api_nodes/nodes_luma.py index b3c32bed5..9cd02ffd2 100644 --- a/comfy_api_nodes/nodes_luma.py +++ b/comfy_api_nodes/nodes_luma.py @@ -1,7 +1,8 @@ from __future__ import annotations from inspect import cleandoc from typing import Optional -from comfy.comfy_types.node_typing import IO, ComfyNodeABC +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io as comfy_io from comfy_api.input_impl.video_types import VideoFromFile from comfy_api_nodes.apis.luma_api import ( LumaImageModel, @@ -51,174 +52,186 @@ def image_result_url_extractor(response: LumaGeneration): def video_result_url_extractor(response: LumaGeneration): return response.assets.video if hasattr(response, "assets") and hasattr(response.assets, "video") else None -class LumaReferenceNode(ComfyNodeABC): +class LumaReferenceNode(comfy_io.ComfyNode): """ Holds an image and weight for use with Luma Generate Image node. """ - RETURN_TYPES = (LumaIO.LUMA_REF,) - RETURN_NAMES = ("luma_ref",) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "create_luma_reference" - CATEGORY = "api node/image/Luma" + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="LumaReferenceNode", + display_name="Luma Reference", + category="api node/image/Luma", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input( + "image", + tooltip="Image to use as reference.", + ), + comfy_io.Float.Input( + "weight", + default=1.0, + min=0.0, + max=1.0, + step=0.01, + tooltip="Weight of image reference.", + ), + comfy_io.Custom(LumaIO.LUMA_REF).Input( + "luma_ref", + optional=True, + ), + ], + outputs=[comfy_io.Custom(LumaIO.LUMA_REF).Output(display_name="luma_ref")], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ( - IO.IMAGE, - { - "tooltip": "Image to use as reference.", - }, - ), - "weight": ( - IO.FLOAT, - { - "default": 1.0, - "min": 0.0, - "max": 1.0, - "step": 0.01, - "tooltip": "Weight of image reference.", - }, - ), - }, - "optional": {"luma_ref": (LumaIO.LUMA_REF,)}, - } - - def create_luma_reference( - self, image: torch.Tensor, weight: float, luma_ref: LumaReferenceChain = None - ): + def execute( + cls, image: torch.Tensor, weight: float, luma_ref: LumaReferenceChain = None + ) -> comfy_io.NodeOutput: if luma_ref is not None: luma_ref = luma_ref.clone() else: luma_ref = LumaReferenceChain() luma_ref.add(LumaReference(image=image, weight=round(weight, 2))) - return (luma_ref,) + return comfy_io.NodeOutput(luma_ref) -class LumaConceptsNode(ComfyNodeABC): +class LumaConceptsNode(comfy_io.ComfyNode): """ Holds one or more Camera Concepts for use with Luma Text to Video and Luma Image to Video nodes. """ - RETURN_TYPES = (LumaIO.LUMA_CONCEPTS,) - RETURN_NAMES = ("luma_concepts",) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "create_concepts" - CATEGORY = "api node/video/Luma" + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="LumaConceptsNode", + display_name="Luma Concepts", + category="api node/video/Luma", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Combo.Input( + "concept1", + options=get_luma_concepts(include_none=True), + ), + comfy_io.Combo.Input( + "concept2", + options=get_luma_concepts(include_none=True), + ), + comfy_io.Combo.Input( + "concept3", + options=get_luma_concepts(include_none=True), + ), + comfy_io.Combo.Input( + "concept4", + options=get_luma_concepts(include_none=True), + ), + comfy_io.Custom(LumaIO.LUMA_CONCEPTS).Input( + "luma_concepts", + tooltip="Optional Camera Concepts to add to the ones chosen here.", + optional=True, + ), + ], + outputs=[comfy_io.Custom(LumaIO.LUMA_CONCEPTS).Output(display_name="luma_concepts")], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "concept1": (get_luma_concepts(include_none=True),), - "concept2": (get_luma_concepts(include_none=True),), - "concept3": (get_luma_concepts(include_none=True),), - "concept4": (get_luma_concepts(include_none=True),), - }, - "optional": { - "luma_concepts": ( - LumaIO.LUMA_CONCEPTS, - { - "tooltip": "Optional Camera Concepts to add to the ones chosen here." - }, - ), - }, - } - - def create_concepts( - self, + def execute( + cls, concept1: str, concept2: str, concept3: str, concept4: str, luma_concepts: LumaConceptChain = None, - ): + ) -> comfy_io.NodeOutput: chain = LumaConceptChain(str_list=[concept1, concept2, concept3, concept4]) if luma_concepts is not None: chain = luma_concepts.clone_and_merge(chain) - return (chain,) + return comfy_io.NodeOutput(chain) -class LumaImageGenerationNode(ComfyNodeABC): +class LumaImageGenerationNode(comfy_io.ComfyNode): """ Generates images synchronously based on prompt and aspect ratio. """ - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Luma" + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="LumaImageNode", + display_name="Luma Text to Image", + category="api node/image/Luma", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", + ), + comfy_io.Combo.Input( + "model", + options=[model.value for model in LumaImageModel], + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[ratio.value for ratio in LumaAspectRatio], + default=LumaAspectRatio.ratio_16_9, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", + ), + comfy_io.Float.Input( + "style_image_weight", + default=1.0, + min=0.0, + max=1.0, + step=0.01, + tooltip="Weight of style image. Ignored if no style_image provided.", + ), + comfy_io.Custom(LumaIO.LUMA_REF).Input( + "image_luma_ref", + tooltip="Luma Reference node connection to influence generation with input images; up to 4 images can be considered.", + optional=True, + ), + comfy_io.Image.Input( + "style_image", + tooltip="Style reference image; only 1 image will be used.", + optional=True, + ), + comfy_io.Image.Input( + "character_image", + tooltip="Character reference images; can be a batch of multiple, up to 4 images can be considered.", + optional=True, + ), + ], + outputs=[comfy_io.Image.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, - ), - "model": ([model.value for model in LumaImageModel],), - "aspect_ratio": ( - [ratio.value for ratio in LumaAspectRatio], - { - "default": LumaAspectRatio.ratio_16_9, - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - "style_image_weight": ( - IO.FLOAT, - { - "default": 1.0, - "min": 0.0, - "max": 1.0, - "step": 0.01, - "tooltip": "Weight of style image. Ignored if no style_image provided.", - }, - ), - }, - "optional": { - "image_luma_ref": ( - LumaIO.LUMA_REF, - { - "tooltip": "Luma Reference node connection to influence generation with input images; up to 4 images can be considered." - }, - ), - "style_image": ( - IO.IMAGE, - {"tooltip": "Style reference image; only 1 image will be used."}, - ), - "character_image": ( - IO.IMAGE, - { - "tooltip": "Character reference images; can be a batch of multiple, up to 4 images can be considered." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, model: str, aspect_ratio: str, @@ -227,27 +240,29 @@ class LumaImageGenerationNode(ComfyNodeABC): image_luma_ref: LumaReferenceChain = None, style_image: torch.Tensor = None, character_image: torch.Tensor = None, - unique_id: str = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=3) + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } # handle image_luma_ref api_image_ref = None if image_luma_ref is not None: - api_image_ref = await self._convert_luma_refs( - image_luma_ref, max_refs=4, auth_kwargs=kwargs, + api_image_ref = await cls._convert_luma_refs( + image_luma_ref, max_refs=4, auth_kwargs=auth_kwargs, ) # handle style_luma_ref api_style_ref = None if style_image is not None: - api_style_ref = await self._convert_style_image( - style_image, weight=style_image_weight, auth_kwargs=kwargs, + api_style_ref = await cls._convert_style_image( + style_image, weight=style_image_weight, auth_kwargs=auth_kwargs, ) # handle character_ref images character_ref = None if character_image is not None: download_urls = await upload_images_to_comfyapi( - character_image, max_images=4, auth_kwargs=kwargs, + character_image, max_images=4, auth_kwargs=auth_kwargs, ) character_ref = LumaCharacterRef( identity0=LumaImageIdentity(images=download_urls) @@ -268,7 +283,7 @@ class LumaImageGenerationNode(ComfyNodeABC): style_ref=api_style_ref, character_ref=character_ref, ), - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) response_api: LumaGeneration = await operation.execute() @@ -283,18 +298,19 @@ class LumaImageGenerationNode(ComfyNodeABC): failed_statuses=[LumaState.failed], status_extractor=lambda x: x.state, result_url_extractor=image_result_url_extractor, - node_id=unique_id, - auth_kwargs=kwargs, + node_id=cls.hidden.unique_id, + auth_kwargs=auth_kwargs, ) response_poll = await operation.execute() async with aiohttp.ClientSession() as session: async with session.get(response_poll.assets.image) as img_response: img = process_image_response(await img_response.content.read()) - return (img,) + return comfy_io.NodeOutput(img) + @classmethod async def _convert_luma_refs( - self, luma_ref: LumaReferenceChain, max_refs: int, auth_kwargs: Optional[dict[str,str]] = None + cls, luma_ref: LumaReferenceChain, max_refs: int, auth_kwargs: Optional[dict[str,str]] = None ): luma_urls = [] ref_count = 0 @@ -308,82 +324,84 @@ class LumaImageGenerationNode(ComfyNodeABC): break return luma_ref.create_api_model(download_urls=luma_urls, max_refs=max_refs) + @classmethod async def _convert_style_image( - self, style_image: torch.Tensor, weight: float, auth_kwargs: Optional[dict[str,str]] = None + cls, style_image: torch.Tensor, weight: float, auth_kwargs: Optional[dict[str,str]] = None ): chain = LumaReferenceChain( first_ref=LumaReference(image=style_image, weight=weight) ) - return await self._convert_luma_refs(chain, max_refs=1, auth_kwargs=auth_kwargs) + return await cls._convert_luma_refs(chain, max_refs=1, auth_kwargs=auth_kwargs) -class LumaImageModifyNode(ComfyNodeABC): +class LumaImageModifyNode(comfy_io.ComfyNode): """ Modifies images synchronously based on prompt and aspect ratio. """ - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Luma" + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="LumaImageModifyNode", + display_name="Luma Image to Image", + category="api node/image/Luma", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input( + "image", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", + ), + comfy_io.Float.Input( + "image_weight", + default=0.1, + min=0.0, + max=0.98, + step=0.01, + tooltip="Weight of the image; the closer to 1.0, the less the image will be modified.", + ), + comfy_io.Combo.Input( + "model", + options=[model.value for model in LumaImageModel], + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", + ), + ], + outputs=[comfy_io.Image.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, - ), - "image_weight": ( - IO.FLOAT, - { - "default": 0.1, - "min": 0.0, - "max": 0.98, - "step": 0.01, - "tooltip": "Weight of the image; the closer to 1.0, the less the image will be modified.", - }, - ), - "model": ([model.value for model in LumaImageModel],), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": {}, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, model: str, image: torch.Tensor, image_weight: float, seed, - unique_id: str = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } # first, upload image download_urls = await upload_images_to_comfyapi( - image, max_images=1, auth_kwargs=kwargs, + image, max_images=1, auth_kwargs=auth_kwargs, ) image_url = download_urls[0] # next, make Luma call with download url provided @@ -401,7 +419,7 @@ class LumaImageModifyNode(ComfyNodeABC): url=image_url, weight=round(max(min(1.0-image_weight, 0.98), 0.0), 2) ), ), - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) response_api: LumaGeneration = await operation.execute() @@ -416,88 +434,84 @@ class LumaImageModifyNode(ComfyNodeABC): failed_statuses=[LumaState.failed], status_extractor=lambda x: x.state, result_url_extractor=image_result_url_extractor, - node_id=unique_id, - auth_kwargs=kwargs, + node_id=cls.hidden.unique_id, + auth_kwargs=auth_kwargs, ) response_poll = await operation.execute() async with aiohttp.ClientSession() as session: async with session.get(response_poll.assets.image) as img_response: img = process_image_response(await img_response.content.read()) - return (img,) + return comfy_io.NodeOutput(img) -class LumaTextToVideoGenerationNode(ComfyNodeABC): +class LumaTextToVideoGenerationNode(comfy_io.ComfyNode): """ Generates videos synchronously based on prompt and output_size. """ - RETURN_TYPES = (IO.VIDEO,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/video/Luma" + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="LumaVideoNode", + display_name="Luma Text to Video", + category="api node/video/Luma", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the video generation", + ), + comfy_io.Combo.Input( + "model", + options=[model.value for model in LumaVideoModel], + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[ratio.value for ratio in LumaAspectRatio], + default=LumaAspectRatio.ratio_16_9, + ), + comfy_io.Combo.Input( + "resolution", + options=[resolution.value for resolution in LumaVideoOutputResolution], + default=LumaVideoOutputResolution.res_540p, + ), + comfy_io.Combo.Input( + "duration", + options=[dur.value for dur in LumaVideoModelOutputDuration], + ), + comfy_io.Boolean.Input( + "loop", + default=False, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", + ), + comfy_io.Custom(LumaIO.LUMA_CONCEPTS).Input( + "luma_concepts", + tooltip="Optional Camera Concepts to dictate camera motion via the Luma Concepts node.", + optional=True, + ) + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the video generation", - }, - ), - "model": ([model.value for model in LumaVideoModel],), - "aspect_ratio": ( - [ratio.value for ratio in LumaAspectRatio], - { - "default": LumaAspectRatio.ratio_16_9, - }, - ), - "resolution": ( - [resolution.value for resolution in LumaVideoOutputResolution], - { - "default": LumaVideoOutputResolution.res_540p, - }, - ), - "duration": ([dur.value for dur in LumaVideoModelOutputDuration],), - "loop": ( - IO.BOOLEAN, - { - "default": False, - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "luma_concepts": ( - LumaIO.LUMA_CONCEPTS, - { - "tooltip": "Optional Camera Concepts to dictate camera motion via the Luma Concepts node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, model: str, aspect_ratio: str, @@ -506,13 +520,15 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC): loop: bool, seed, luma_concepts: LumaConceptChain = None, - unique_id: str = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=False, min_length=3) duration = duration if model != LumaVideoModel.ray_1_6 else None resolution = resolution if model != LumaVideoModel.ray_1_6 else None + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/luma/generations", @@ -529,12 +545,12 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC): loop=loop, concepts=luma_concepts.create_api_model() if luma_concepts else None, ), - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) response_api: LumaGeneration = await operation.execute() - if unique_id: - PromptServer.instance.send_progress_text(f"Luma video generation started: {response_api.id}", unique_id) + if cls.hidden.unique_id: + PromptServer.instance.send_progress_text(f"Luma video generation started: {response_api.id}", cls.hidden.unique_id) operation = PollingOperation( poll_endpoint=ApiEndpoint( @@ -547,90 +563,94 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC): failed_statuses=[LumaState.failed], status_extractor=lambda x: x.state, result_url_extractor=video_result_url_extractor, - node_id=unique_id, + node_id=cls.hidden.unique_id, estimated_duration=LUMA_T2V_AVERAGE_DURATION, - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) response_poll = await operation.execute() async with aiohttp.ClientSession() as session: async with session.get(response_poll.assets.video) as vid_response: - return (VideoFromFile(BytesIO(await vid_response.content.read())),) + return comfy_io.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) -class LumaImageToVideoGenerationNode(ComfyNodeABC): +class LumaImageToVideoGenerationNode(comfy_io.ComfyNode): """ Generates videos synchronously based on prompt, input images, and output_size. """ - RETURN_TYPES = (IO.VIDEO,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/video/Luma" + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="LumaImageToVideoNode", + display_name="Luma Image to Video", + category="api node/video/Luma", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the video generation", + ), + comfy_io.Combo.Input( + "model", + options=[model.value for model in LumaVideoModel], + ), + # comfy_io.Combo.Input( + # "aspect_ratio", + # options=[ratio.value for ratio in LumaAspectRatio], + # default=LumaAspectRatio.ratio_16_9, + # ), + comfy_io.Combo.Input( + "resolution", + options=[resolution.value for resolution in LumaVideoOutputResolution], + default=LumaVideoOutputResolution.res_540p, + ), + comfy_io.Combo.Input( + "duration", + options=[dur.value for dur in LumaVideoModelOutputDuration], + ), + comfy_io.Boolean.Input( + "loop", + default=False, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", + ), + comfy_io.Image.Input( + "first_image", + tooltip="First frame of generated video.", + optional=True, + ), + comfy_io.Image.Input( + "last_image", + tooltip="Last frame of generated video.", + optional=True, + ), + comfy_io.Custom(LumaIO.LUMA_CONCEPTS).Input( + "luma_concepts", + tooltip="Optional Camera Concepts to dictate camera motion via the Luma Concepts node.", + optional=True, + ) + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the video generation", - }, - ), - "model": ([model.value for model in LumaVideoModel],), - # "aspect_ratio": ([ratio.value for ratio in LumaAspectRatio], { - # "default": LumaAspectRatio.ratio_16_9, - # }), - "resolution": ( - [resolution.value for resolution in LumaVideoOutputResolution], - { - "default": LumaVideoOutputResolution.res_540p, - }, - ), - "duration": ([dur.value for dur in LumaVideoModelOutputDuration],), - "loop": ( - IO.BOOLEAN, - { - "default": False, - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "first_image": ( - IO.IMAGE, - {"tooltip": "First frame of generated video."}, - ), - "last_image": (IO.IMAGE, {"tooltip": "Last frame of generated video."}), - "luma_concepts": ( - LumaIO.LUMA_CONCEPTS, - { - "tooltip": "Optional Camera Concepts to dictate camera motion via the Luma Concepts node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, model: str, resolution: str, @@ -640,14 +660,16 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): first_image: torch.Tensor = None, last_image: torch.Tensor = None, luma_concepts: LumaConceptChain = None, - unique_id: str = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: if first_image is None and last_image is None: raise Exception( "At least one of first_image and last_image requires an input." ) - keyframes = await self._convert_to_keyframes(first_image, last_image, auth_kwargs=kwargs) + auth_kwargs = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + keyframes = await cls._convert_to_keyframes(first_image, last_image, auth_kwargs=auth_kwargs) duration = duration if model != LumaVideoModel.ray_1_6 else None resolution = resolution if model != LumaVideoModel.ray_1_6 else None @@ -668,12 +690,12 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): keyframes=keyframes, concepts=luma_concepts.create_api_model() if luma_concepts else None, ), - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) response_api: LumaGeneration = await operation.execute() - if unique_id: - PromptServer.instance.send_progress_text(f"Luma video generation started: {response_api.id}", unique_id) + if cls.hidden.unique_id: + PromptServer.instance.send_progress_text(f"Luma video generation started: {response_api.id}", cls.hidden.unique_id) operation = PollingOperation( poll_endpoint=ApiEndpoint( @@ -686,18 +708,19 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): failed_statuses=[LumaState.failed], status_extractor=lambda x: x.state, result_url_extractor=video_result_url_extractor, - node_id=unique_id, + node_id=cls.hidden.unique_id, estimated_duration=LUMA_I2V_AVERAGE_DURATION, - auth_kwargs=kwargs, + auth_kwargs=auth_kwargs, ) response_poll = await operation.execute() async with aiohttp.ClientSession() as session: async with session.get(response_poll.assets.video) as vid_response: - return (VideoFromFile(BytesIO(await vid_response.content.read())),) + return comfy_io.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) + @classmethod async def _convert_to_keyframes( - self, + cls, first_image: torch.Tensor = None, last_image: torch.Tensor = None, auth_kwargs: Optional[dict[str,str]] = None, @@ -719,23 +742,18 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): return LumaKeyframes(frame0=frame0, frame1=frame1) -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "LumaImageNode": LumaImageGenerationNode, - "LumaImageModifyNode": LumaImageModifyNode, - "LumaVideoNode": LumaTextToVideoGenerationNode, - "LumaImageToVideoNode": LumaImageToVideoGenerationNode, - "LumaReferenceNode": LumaReferenceNode, - "LumaConceptsNode": LumaConceptsNode, -} +class LumaExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + LumaImageGenerationNode, + LumaImageModifyNode, + LumaTextToVideoGenerationNode, + LumaImageToVideoGenerationNode, + LumaReferenceNode, + LumaConceptsNode, + ] -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "LumaImageNode": "Luma Text to Image", - "LumaImageModifyNode": "Luma Image to Image", - "LumaVideoNode": "Luma Text to Video", - "LumaImageToVideoNode": "Luma Image to Video", - "LumaReferenceNode": "Luma Reference", - "LumaConceptsNode": "Luma Concepts", -} + +async def comfy_entrypoint() -> LumaExtension: + return LumaExtension() From ad5aef2d0c8517e971129db1dfb0d0108d8341a8 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 12:34:32 +0300 Subject: [PATCH 337/759] convert nodes_pixart.py to V3 schema (#10019) --- comfy_extras/nodes_pixart.py | 52 +++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/comfy_extras/nodes_pixart.py b/comfy_extras/nodes_pixart.py index 8d9276afe..a23e87b1f 100644 --- a/comfy_extras/nodes_pixart.py +++ b/comfy_extras/nodes_pixart.py @@ -1,24 +1,38 @@ -from nodes import MAX_RESOLUTION +from typing_extensions import override +import nodes +from comfy_api.latest import ComfyExtension, io -class CLIPTextEncodePixArtAlpha: +class CLIPTextEncodePixArtAlpha(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "width": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - "height": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - # "aspect_ratio": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "text": ("STRING", {"multiline": True, "dynamicPrompts": True}), "clip": ("CLIP", ), - }} + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodePixArtAlpha", + category="advanced/conditioning", + description="Encodes text and sets the resolution conditioning for PixArt Alpha. Does not apply to PixArt Sigma.", + inputs=[ + io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION), + io.Int.Input("height", default=1024, min=0, max=nodes.MAX_RESOLUTION), + # "aspect_ratio": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), + io.String.Input("text", multiline=True, dynamic_prompts=True), + io.Clip.Input("clip"), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" - CATEGORY = "advanced/conditioning" - DESCRIPTION = "Encodes text and sets the resolution conditioning for PixArt Alpha. Does not apply to PixArt Sigma." - - def encode(self, clip, width, height, text): + @classmethod + def execute(cls, clip, width, height, text): tokens = clip.tokenize(text) - return (clip.encode_from_tokens_scheduled(tokens, add_dict={"width": width, "height": height}),) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens, add_dict={"width": width, "height": height})) -NODE_CLASS_MAPPINGS = { - "CLIPTextEncodePixArtAlpha": CLIPTextEncodePixArtAlpha, -} + +class PixArtExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + CLIPTextEncodePixArtAlpha, + ] + +async def comfy_entrypoint() -> PixArtExtension: + return PixArtExtension() From 7eca95657cf7a70c15d598c969b890a164a300a1 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 12:36:43 +0300 Subject: [PATCH 338/759] convert nodes_photomaker.py to V3 schema (#10017) --- comfy_extras/nodes_photomaker.py | 74 ++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/comfy_extras/nodes_photomaker.py b/comfy_extras/nodes_photomaker.py index d358ed6d5..228183c07 100644 --- a/comfy_extras/nodes_photomaker.py +++ b/comfy_extras/nodes_photomaker.py @@ -4,6 +4,8 @@ import folder_paths import comfy.clip_model import comfy.clip_vision import comfy.ops +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io # code for model from: https://github.com/TencentARC/PhotoMaker/blob/main/photomaker/model.py under Apache License Version 2.0 VISION_CONFIG_DICT = { @@ -116,41 +118,52 @@ class PhotoMakerIDEncoder(comfy.clip_model.CLIPVisionModelProjection): return updated_prompt_embeds -class PhotoMakerLoader: +class PhotoMakerLoader(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "photomaker_model_name": (folder_paths.get_filename_list("photomaker"), )}} + def define_schema(cls): + return io.Schema( + node_id="PhotoMakerLoader", + category="_for_testing/photomaker", + inputs=[ + io.Combo.Input("photomaker_model_name", options=folder_paths.get_filename_list("photomaker")), + ], + outputs=[ + io.Photomaker.Output(), + ], + is_experimental=True, + ) - RETURN_TYPES = ("PHOTOMAKER",) - FUNCTION = "load_photomaker_model" - - CATEGORY = "_for_testing/photomaker" - - def load_photomaker_model(self, photomaker_model_name): + @classmethod + def execute(cls, photomaker_model_name): photomaker_model_path = folder_paths.get_full_path_or_raise("photomaker", photomaker_model_name) photomaker_model = PhotoMakerIDEncoder() data = comfy.utils.load_torch_file(photomaker_model_path, safe_load=True) if "id_encoder" in data: data = data["id_encoder"] photomaker_model.load_state_dict(data) - return (photomaker_model,) + return io.NodeOutput(photomaker_model) -class PhotoMakerEncode: +class PhotoMakerEncode(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "photomaker": ("PHOTOMAKER",), - "image": ("IMAGE",), - "clip": ("CLIP", ), - "text": ("STRING", {"multiline": True, "dynamicPrompts": True, "default": "photograph of photomaker"}), - }} + def define_schema(cls): + return io.Schema( + node_id="PhotoMakerEncode", + category="_for_testing/photomaker", + inputs=[ + io.Photomaker.Input("photomaker"), + io.Image.Input("image"), + io.Clip.Input("clip"), + io.String.Input("text", multiline=True, dynamic_prompts=True, default="photograph of photomaker"), + ], + outputs=[ + io.Conditioning.Output(), + ], + is_experimental=True, + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "apply_photomaker" - - CATEGORY = "_for_testing/photomaker" - - def apply_photomaker(self, photomaker, image, clip, text): + @classmethod + def execute(cls, photomaker, image, clip, text): special_token = "photomaker" pixel_values = comfy.clip_vision.clip_preprocess(image.to(photomaker.load_device)).float() try: @@ -178,11 +191,16 @@ class PhotoMakerEncode: else: out = cond - return ([[out, {"pooled_output": pooled}]], ) + return io.NodeOutput([[out, {"pooled_output": pooled}]]) -NODE_CLASS_MAPPINGS = { - "PhotoMakerLoader": PhotoMakerLoader, - "PhotoMakerEncode": PhotoMakerEncode, -} +class PhotomakerExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + PhotoMakerLoader, + PhotoMakerEncode, + ] +async def comfy_entrypoint() -> PhotomakerExtension: + return PhotomakerExtension() From 160698eb418269d64fbbe8c34db27a4d1ddb0540 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 27 Sep 2025 22:25:35 +0300 Subject: [PATCH 339/759] convert nodes_qwen.py to V3 schema (#10049) --- comfy_extras/nodes_qwen.py | 88 ++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 37 deletions(-) diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py index 49747dc7a..525239ae5 100644 --- a/comfy_extras/nodes_qwen.py +++ b/comfy_extras/nodes_qwen.py @@ -1,24 +1,29 @@ import node_helpers import comfy.utils import math +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class TextEncodeQwenImageEdit: +class TextEncodeQwenImageEdit(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), - }, - "optional": {"vae": ("VAE", ), - "image": ("IMAGE", ),}} + def define_schema(cls): + return io.Schema( + node_id="TextEncodeQwenImageEdit", + category="advanced/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("prompt", multiline=True, dynamic_prompts=True), + io.Vae.Input("vae", optional=True), + io.Image.Input("image", optional=True), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" - - CATEGORY = "advanced/conditioning" - - def encode(self, clip, prompt, vae=None, image=None): + @classmethod + def execute(cls, clip, prompt, vae=None, image=None) -> io.NodeOutput: ref_latent = None if image is None: images = [] @@ -40,28 +45,30 @@ class TextEncodeQwenImageEdit: conditioning = clip.encode_from_tokens_scheduled(tokens) if ref_latent is not None: conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": [ref_latent]}, append=True) - return (conditioning, ) + return io.NodeOutput(conditioning) -class TextEncodeQwenImageEditPlus: +class TextEncodeQwenImageEditPlus(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), - }, - "optional": {"vae": ("VAE", ), - "image1": ("IMAGE", ), - "image2": ("IMAGE", ), - "image3": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="TextEncodeQwenImageEditPlus", + category="advanced/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("prompt", multiline=True, dynamic_prompts=True), + io.Vae.Input("vae", optional=True), + io.Image.Input("image1", optional=True), + io.Image.Input("image2", optional=True), + io.Image.Input("image3", optional=True), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" - - CATEGORY = "advanced/conditioning" - - def encode(self, clip, prompt, vae=None, image1=None, image2=None, image3=None): + @classmethod + def execute(cls, clip, prompt, vae=None, image1=None, image2=None, image3=None) -> io.NodeOutput: ref_latents = [] images = [image1, image2, image3] images_vl = [] @@ -94,10 +101,17 @@ class TextEncodeQwenImageEditPlus: conditioning = clip.encode_from_tokens_scheduled(tokens) if len(ref_latents) > 0: conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": ref_latents}, append=True) - return (conditioning, ) + return io.NodeOutput(conditioning) -NODE_CLASS_MAPPINGS = { - "TextEncodeQwenImageEdit": TextEncodeQwenImageEdit, - "TextEncodeQwenImageEditPlus": TextEncodeQwenImageEditPlus, -} +class QwenExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TextEncodeQwenImageEdit, + TextEncodeQwenImageEditPlus, + ] + + +async def comfy_entrypoint() -> QwenExtension: + return QwenExtension() From 653ceab4148a9fbc050ebceb674acef760792b77 Mon Sep 17 00:00:00 2001 From: rattus128 <46076784+rattus128@users.noreply.github.com> Date: Sun, 28 Sep 2025 08:14:16 +1000 Subject: [PATCH 340/759] Reduce Peak WAN inference VRAM usage - part II (#10062) * flux: math: Use _addcmul to avoid expensive VRAM intermediate The rope process can be the VRAM peak and this intermediate for the addition result before releasing the original can OOM. addcmul_ it. * wan: Delete the self attention before cross attention This saves VRAM when the cross attention and FFN are in play as the VRAM peak. --- comfy/ldm/flux/math.py | 5 ++++- comfy/ldm/wan/model.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/comfy/ldm/flux/math.py b/comfy/ldm/flux/math.py index fb7cd7586..8deda0d4a 100644 --- a/comfy/ldm/flux/math.py +++ b/comfy/ldm/flux/math.py @@ -37,7 +37,10 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor: def apply_rope1(x: Tensor, freqs_cis: Tensor): x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2) - x_out = freqs_cis[..., 0] * x_[..., 0] + freqs_cis[..., 1] * x_[..., 1] + + x_out = freqs_cis[..., 0] * x_[..., 0] + x_out.addcmul_(freqs_cis[..., 1], x_[..., 1]) + return x_out.reshape(*x.shape).type_as(x) def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor): diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 54616e6eb..0dc650ced 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -237,6 +237,7 @@ class WanAttentionBlock(nn.Module): freqs, transformer_options=transformer_options) x = torch.addcmul(x, y, repeat_e(e[2], x)) + del y # cross-attention & ffn x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len, transformer_options=transformer_options) From 40ae495ddcbc04846e91ccad3e844bb34d98c6fd Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 27 Sep 2025 17:28:49 -0700 Subject: [PATCH 341/759] Improvements to the stable release workflow. (#10065) --- .github/workflows/stable-release.yml | 39 ++++++++++++------- .../windows_release_dependencies.yml | 3 +- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index 2bc8e5905..b39b42acd 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -8,11 +8,11 @@ on: description: 'Git tag' required: true type: string - cu: - description: 'CUDA version' + cache_tag: + description: 'Cached dependencies tag' required: true type: string - default: "129" + default: "cu129" python_minor: description: 'Python minor version' required: true @@ -23,7 +23,11 @@ on: required: true type: string default: "6" - + rel_name: + description: 'Release name' + required: true + type: string + default: "nvidia" jobs: package_comfy_windows: @@ -42,15 +46,15 @@ jobs: id: cache with: path: | - cu${{ inputs.cu }}_python_deps.tar + ${{ inputs.cache_tag }}_python_deps.tar update_comfyui_and_python_dependencies.bat - key: ${{ runner.os }}-build-cu${{ inputs.cu }}-${{ inputs.python_minor }} + key: ${{ runner.os }}-build-${{ inputs.cache_tag }}-${{ inputs.python_minor }} - shell: bash run: | - mv cu${{ inputs.cu }}_python_deps.tar ../ + mv ${{ inputs.cache_tag }}_python_deps.tar ../ mv update_comfyui_and_python_dependencies.bat ../ cd .. - tar xf cu${{ inputs.cu }}_python_deps.tar + tar xf ${{ inputs.cache_tag }}_python_deps.tar pwd ls @@ -65,12 +69,19 @@ jobs: echo 'import site' >> ./python3${{ inputs.python_minor }}._pth curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py ./python.exe get-pip.py - ./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/* + ./python.exe -s -m pip install ../${{ inputs.cache_tag }}_python_deps/* + + grep comfyui ../ComfyUI/requirements.txt ./requirements_comfyui.txt + ./python.exe -s -m pip install -r requirements_comfyui.txt + rm requirements_comfyui.txt + sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth - rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space - rm ./Lib/site-packages/torch/lib/libprotoc.lib - rm ./Lib/site-packages/torch/lib/libprotobuf.lib + if test -f ./Lib/site-packages/torch/lib/dnnl.lib; then + rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space + rm ./Lib/site-packages/torch/lib/libprotoc.lib + rm ./Lib/site-packages/torch/lib/libprotobuf.lib + fi cd .. @@ -91,7 +102,7 @@ jobs: cd .. "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable - mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_nvidia.7z + mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_${{ inputs.rel_name }}.7z cd ComfyUI_windows_portable python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu @@ -104,7 +115,7 @@ jobs: uses: svenstaro/upload-release-action@v2 with: repo_token: ${{ secrets.GITHUB_TOKEN }} - file: ComfyUI_windows_portable_nvidia.7z + file: ComfyUI_windows_portable_${{ inputs.rel_name }}.7z tag: ${{ inputs.git_tag }} overwrite: true draft: true diff --git a/.github/workflows/windows_release_dependencies.yml b/.github/workflows/windows_release_dependencies.yml index 7761cc1ed..f1e2946e6 100644 --- a/.github/workflows/windows_release_dependencies.yml +++ b/.github/workflows/windows_release_dependencies.yml @@ -56,7 +56,8 @@ jobs: ..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 pause" > update_comfyui_and_python_dependencies.bat - python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir + grep -v comfyui requirements.txt > requirements_nocomfyui.txt + python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements_nocomfyui.txt pygit2 -w ./temp_wheel_dir python -m pip install --no-cache-dir ./temp_wheel_dir/* echo installed basic ls -lah temp_wheel_dir From 896f2e653c02769371e113906d70a24306d87a58 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 27 Sep 2025 18:30:35 -0700 Subject: [PATCH 342/759] Fix typo in release workflow. (#10066) --- .github/workflows/stable-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index b39b42acd..619b0e995 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -71,7 +71,7 @@ jobs: ./python.exe get-pip.py ./python.exe -s -m pip install ../${{ inputs.cache_tag }}_python_deps/* - grep comfyui ../ComfyUI/requirements.txt ./requirements_comfyui.txt + grep comfyui ../ComfyUI/requirements.txt > ./requirements_comfyui.txt ./python.exe -s -m pip install -r requirements_comfyui.txt rm requirements_comfyui.txt From a1127b232d221432be065f8e765f3538e62a2f41 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 28 Sep 2025 05:11:36 +0300 Subject: [PATCH 343/759] convert nodes_lotus.py to V3 schema (#10057) --- comfy_extras/nodes_lotus.py | 42 +++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/comfy_extras/nodes_lotus.py b/comfy_extras/nodes_lotus.py index 739dbdd3d..9f62ba2bf 100644 --- a/comfy_extras/nodes_lotus.py +++ b/comfy_extras/nodes_lotus.py @@ -1,20 +1,22 @@ +from typing_extensions import override + import torch import comfy.model_management as mm +from comfy_api.latest import ComfyExtension, io -class LotusConditioning: + +class LotusConditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - }, - } + def define_schema(cls): + return io.Schema( + node_id="LotusConditioning", + category="conditioning/lotus", + inputs=[], + outputs=[io.Conditioning.Output(display_name="conditioning")], + ) - RETURN_TYPES = ("CONDITIONING",) - RETURN_NAMES = ("conditioning",) - FUNCTION = "conditioning" - CATEGORY = "conditioning/lotus" - - def conditioning(self): + @classmethod + def execute(cls) -> io.NodeOutput: device = mm.get_torch_device() #lotus uses a frozen encoder and null conditioning, i'm just inlining the results of that operation since it doesn't change #and getting parity with the reference implementation would otherwise require inference and 800mb of tensors @@ -22,8 +24,16 @@ class LotusConditioning: cond = [[prompt_embeds, {}]] - return (cond,) + return io.NodeOutput(cond) -NODE_CLASS_MAPPINGS = { - "LotusConditioning" : LotusConditioning, -} + +class LotusExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + LotusConditioning, + ] + + +async def comfy_entrypoint() -> LotusExtension: + return LotusExtension() From 1cf86f5ae5706ff141f8d51ed9ba96ecdcdcb695 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 28 Sep 2025 05:12:51 +0300 Subject: [PATCH 344/759] convert nodes_lumina2.py to V3 schema (#10058) --- comfy_extras/nodes_lumina2.py | 99 +++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 38 deletions(-) diff --git a/comfy_extras/nodes_lumina2.py b/comfy_extras/nodes_lumina2.py index 275189785..89ff2397a 100644 --- a/comfy_extras/nodes_lumina2.py +++ b/comfy_extras/nodes_lumina2.py @@ -1,20 +1,27 @@ -from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict +from typing_extensions import override import torch +from comfy_api.latest import ComfyExtension, io -class RenormCFG: + +class RenormCFG(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "cfg_trunc": ("FLOAT", {"default": 100, "min": 0.0, "max": 100.0, "step": 0.01}), - "renorm_cfg": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="RenormCFG", + category="advanced/model", + inputs=[ + io.Model.Input("model"), + io.Float.Input("cfg_trunc", default=100, min=0.0, max=100.0, step=0.01), + io.Float.Input("renorm_cfg", default=1.0, min=0.0, max=100.0, step=0.01), + ], + outputs=[ + io.Model.Output(), + ], + ) - CATEGORY = "advanced/model" - - def patch(self, model, cfg_trunc, renorm_cfg): + @classmethod + def execute(cls, model, cfg_trunc, renorm_cfg) -> io.NodeOutput: def renorm_cfg_func(args): cond_denoised = args["cond_denoised"] uncond_denoised = args["uncond_denoised"] @@ -53,10 +60,10 @@ class RenormCFG: m = model.clone() m.set_model_sampler_cfg_function(renorm_cfg_func) - return (m, ) + return io.NodeOutput(m) -class CLIPTextEncodeLumina2(ComfyNodeABC): +class CLIPTextEncodeLumina2(io.ComfyNode): SYSTEM_PROMPT = { "superior": "You are an assistant designed to generate superior images with the superior "\ "degree of image-text alignment based on textual prompts or user prompts.", @@ -69,36 +76,52 @@ class CLIPTextEncodeLumina2(ComfyNodeABC): "Alignment: You are an assistant designed to generate high-quality images with the highest "\ "degree of image-text alignment based on textual prompts." @classmethod - def INPUT_TYPES(s) -> InputTypeDict: - return { - "required": { - "system_prompt": (list(CLIPTextEncodeLumina2.SYSTEM_PROMPT.keys()), {"tooltip": CLIPTextEncodeLumina2.SYSTEM_PROMPT_TIP}), - "user_prompt": (IO.STRING, {"multiline": True, "dynamicPrompts": True, "tooltip": "The text to be encoded."}), - "clip": (IO.CLIP, {"tooltip": "The CLIP model used for encoding the text."}) - } - } - RETURN_TYPES = (IO.CONDITIONING,) - OUTPUT_TOOLTIPS = ("A conditioning containing the embedded text used to guide the diffusion model.",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodeLumina2", + display_name="CLIP Text Encode for Lumina2", + category="conditioning", + description="Encodes a system prompt and a user prompt using a CLIP model into an embedding " + "that can be used to guide the diffusion model towards generating specific images.", + inputs=[ + io.Combo.Input( + "system_prompt", + options=list(cls.SYSTEM_PROMPT.keys()), + tooltip=cls.SYSTEM_PROMPT_TIP, + ), + io.String.Input( + "user_prompt", + multiline=True, + dynamic_prompts=True, + tooltip="The text to be encoded.", + ), + io.Clip.Input("clip", tooltip="The CLIP model used for encoding the text."), + ], + outputs=[ + io.Conditioning.Output( + tooltip="A conditioning containing the embedded text used to guide the diffusion model.", + ), + ], + ) - CATEGORY = "conditioning" - DESCRIPTION = "Encodes a system prompt and a user prompt using a CLIP model into an embedding that can be used to guide the diffusion model towards generating specific images." - - def encode(self, clip, user_prompt, system_prompt): + @classmethod + def execute(cls, clip, user_prompt, system_prompt) -> io.NodeOutput: if clip is None: raise RuntimeError("ERROR: clip input is invalid: None\n\nIf the clip is from a checkpoint loader node your checkpoint does not contain a valid clip or text encoder model.") - system_prompt = CLIPTextEncodeLumina2.SYSTEM_PROMPT[system_prompt] + system_prompt = cls.SYSTEM_PROMPT[system_prompt] prompt = f'{system_prompt} {user_prompt}' tokens = clip.tokenize(prompt) - return (clip.encode_from_tokens_scheduled(tokens), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens)) -NODE_CLASS_MAPPINGS = { - "CLIPTextEncodeLumina2": CLIPTextEncodeLumina2, - "RenormCFG": RenormCFG -} +class Lumina2Extension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + CLIPTextEncodeLumina2, + RenormCFG, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "CLIPTextEncodeLumina2": "CLIP Text Encode for Lumina2", -} +async def comfy_entrypoint() -> Lumina2Extension: + return Lumina2Extension() From 2dadb348602f8f452eb2a1d8720f6029dc4039a2 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 28 Sep 2025 05:16:22 +0300 Subject: [PATCH 345/759] convert nodes_hypertile.py to V3 schema (#10061) --- comfy_extras/nodes_hypertile.py | 59 +++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/comfy_extras/nodes_hypertile.py b/comfy_extras/nodes_hypertile.py index b366117c7..0ad5e6773 100644 --- a/comfy_extras/nodes_hypertile.py +++ b/comfy_extras/nodes_hypertile.py @@ -1,9 +1,11 @@ #Taken from: https://github.com/tfernd/HyperTile/ import math +from typing_extensions import override from einops import rearrange # Use torch rng for consistency across generations from torch import randint +from comfy_api.latest import ComfyExtension, io def random_divisor(value: int, min_value: int, /, max_options: int = 1) -> int: min_value = min(min_value, value) @@ -20,25 +22,31 @@ def random_divisor(value: int, min_value: int, /, max_options: int = 1) -> int: return ns[idx] -class HyperTile: +class HyperTile(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "tile_size": ("INT", {"default": 256, "min": 1, "max": 2048}), - "swap_size": ("INT", {"default": 2, "min": 1, "max": 128}), - "max_depth": ("INT", {"default": 0, "min": 0, "max": 10}), - "scale_depth": ("BOOLEAN", {"default": False}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="HyperTile", + category="model_patches/unet", + inputs=[ + io.Model.Input("model"), + io.Int.Input("tile_size", default=256, min=1, max=2048), + io.Int.Input("swap_size", default=2, min=1, max=128), + io.Int.Input("max_depth", default=0, min=0, max=10), + io.Boolean.Input("scale_depth", default=False), + ], + outputs=[ + io.Model.Output(), + ], + ) - CATEGORY = "model_patches/unet" - - def patch(self, model, tile_size, swap_size, max_depth, scale_depth): + @classmethod + def execute(cls, model, tile_size, swap_size, max_depth, scale_depth) -> io.NodeOutput: latent_tile_size = max(32, tile_size) // 8 - self.temp = None + temp = None def hypertile_in(q, k, v, extra_options): + nonlocal temp model_chans = q.shape[-2] orig_shape = extra_options['original_shape'] apply_to = [] @@ -58,14 +66,15 @@ class HyperTile: if nh * nw > 1: q = rearrange(q, "b (nh h nw w) c -> (b nh nw) (h w) c", h=h // nh, w=w // nw, nh=nh, nw=nw) - self.temp = (nh, nw, h, w) + temp = (nh, nw, h, w) return q, k, v return q, k, v def hypertile_out(out, extra_options): - if self.temp is not None: - nh, nw, h, w = self.temp - self.temp = None + nonlocal temp + if temp is not None: + nh, nw, h, w = temp + temp = None out = rearrange(out, "(b nh nw) hw c -> b nh nw hw c", nh=nh, nw=nw) out = rearrange(out, "b nh nw (h w) c -> b (nh h nw w) c", h=h // nh, w=w // nw) return out @@ -76,6 +85,14 @@ class HyperTile: m.set_model_attn1_output_patch(hypertile_out) return (m, ) -NODE_CLASS_MAPPINGS = { - "HyperTile": HyperTile, -} + +class HyperTileExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + HyperTile, + ] + + +async def comfy_entrypoint() -> HyperTileExtension: + return HyperTileExtension() From 1364548c721a466adcdc60e49ee291b0d4255245 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rui=20Wang=20=28=E7=8E=8B=E7=91=9E=29?= Date: Sun, 28 Sep 2025 10:36:02 +0800 Subject: [PATCH 346/759] feat: ComfyUI can be run on the specified Ascend NPU (#9663) * feature: Set the Ascend NPU to use a single one * Enable the `--cuda-device` parameter to support both CUDA and Ascend NPUs simultaneously. * Make the code just set the ASCENT_RT_VISIBLE_DEVICES environment variable without any other edits to master branch --------- Co-authored-by: Jedrzej Kosinski --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index c33f0e17b..70696fcc3 100644 --- a/main.py +++ b/main.py @@ -127,6 +127,7 @@ if __name__ == "__main__": if args.cuda_device is not None: os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda_device) os.environ['HIP_VISIBLE_DEVICES'] = str(args.cuda_device) + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = str(args.cuda_device) logging.info("Set cuda device to: {}".format(args.cuda_device)) if args.oneapi_device_selector is not None: From 555f902fc1ed20e98201f9102172f0fc190c2c42 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 27 Sep 2025 19:43:25 -0700 Subject: [PATCH 347/759] Fix stable workflow creating multiple draft releases. (#10067) --- .github/workflows/stable-release.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index 619b0e995..924bdec90 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -112,10 +112,9 @@ jobs: ls - name: Upload binaries to release - uses: svenstaro/upload-release-action@v2 + uses: softprops/action-gh-release@v2 with: - repo_token: ${{ secrets.GITHUB_TOKEN }} - file: ComfyUI_windows_portable_${{ inputs.rel_name }}.7z - tag: ${{ inputs.git_tag }} - overwrite: true + files: ComfyUI_windows_portable_${{ inputs.rel_name }}.7z + tag_name: ${{ inputs.git_tag }} draft: true + overwrite_files: true From b60dc316272ba139e06b8a7b2f5f5b622c9afe20 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 28 Sep 2025 10:41:32 -0700 Subject: [PATCH 348/759] Update command to install latest nighly pytorch. (#10085) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3f6cfc2ed..5a257687b 100644 --- a/README.md +++ b/README.md @@ -233,7 +233,7 @@ Nvidia users should install stable pytorch using this command: This is the command to install pytorch nightly instead which might have performance improvements. -```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129``` +```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu130``` #### Troubleshooting From 6ec1cfe101206229ff3af5c3d3675b3b92477067 Mon Sep 17 00:00:00 2001 From: Changrz <51637999+WhiteGiven@users.noreply.github.com> Date: Tue, 30 Sep 2025 02:59:12 +0800 Subject: [PATCH 349/759] [Rodin3d api nodes] Updated the name of the save file path (changed from timestamp to UUID). (#10011) * Update savepath name from time to uuid * delete lib --- comfy_api_nodes/nodes_rodin.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index 1af393eba..817efb0f5 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -11,7 +11,6 @@ from comfy.comfy_types.node_typing import IO import folder_paths as comfy_paths import aiohttp import os -import datetime import asyncio import io import logging @@ -243,8 +242,8 @@ class Rodin3DAPI: return mesh_mode, quality_override - async def download_files(self, url_list): - save_path = os.path.join(comfy_paths.get_output_directory(), "Rodin3D", datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) + async def download_files(self, url_list, task_uuid): + save_path = os.path.join(comfy_paths.get_output_directory(), f"Rodin3D_{task_uuid}") os.makedirs(save_path, exist_ok=True) model_file_path = None async with aiohttp.ClientSession() as session: @@ -320,7 +319,7 @@ class Rodin3D_Regular(Rodin3DAPI): **kwargs) await self.poll_for_task_status(subscription_key, **kwargs) download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list) + model = await self.download_files(download_list, task_uuid) return (model,) @@ -366,7 +365,7 @@ class Rodin3D_Detail(Rodin3DAPI): **kwargs) await self.poll_for_task_status(subscription_key, **kwargs) download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list) + model = await self.download_files(download_list, task_uuid) return (model,) @@ -412,7 +411,7 @@ class Rodin3D_Smooth(Rodin3DAPI): **kwargs) await self.poll_for_task_status(subscription_key, **kwargs) download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list) + model = await self.download_files(download_list, task_uuid) return (model,) @@ -467,7 +466,7 @@ class Rodin3D_Sketch(Rodin3DAPI): ) await self.poll_for_task_status(subscription_key, **kwargs) download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list) + model = await self.download_files(download_list, task_uuid) return (model,) From c8276f8c6bee54b494fd5bec8dfb87ed21a3fa65 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 30 Sep 2025 02:59:42 +0800 Subject: [PATCH 350/759] Update template to 0.1.91 (#10096) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b3f81e8fa..45d3e1607 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.26.13 -comfyui-workflow-templates==0.1.88 +comfyui-workflow-templates==0.1.91 comfyui-embedded-docs==0.2.6 torch torchsde From 05a258efd84bfb00e2618eb9b7937b8fef1e82ed Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:01:04 +0300 Subject: [PATCH 351/759] add WanImageToImageApi node (#10094) --- comfy_api_nodes/nodes_wan.py | 149 ++++++++++++++++++++++++++++++++++- 1 file changed, 148 insertions(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_wan.py b/comfy_api_nodes/nodes_wan.py index db5bd41c1..0be5daadb 100644 --- a/comfy_api_nodes/nodes_wan.py +++ b/comfy_api_nodes/nodes_wan.py @@ -28,6 +28,12 @@ class Text2ImageInputField(BaseModel): negative_prompt: Optional[str] = Field(None) +class Image2ImageInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: Optional[str] = Field(None) + images: list[str] = Field(..., min_length=1, max_length=2) + + class Text2VideoInputField(BaseModel): prompt: str = Field(...) negative_prompt: Optional[str] = Field(None) @@ -49,6 +55,13 @@ class Txt2ImageParametersField(BaseModel): watermark: bool = Field(True) +class Image2ImageParametersField(BaseModel): + size: Optional[str] = Field(None) + n: int = Field(1, description="Number of images to generate.") # we support only value=1 + seed: int = Field(..., ge=0, le=2147483647) + watermark: bool = Field(True) + + class Text2VideoParametersField(BaseModel): size: str = Field(...) seed: int = Field(..., ge=0, le=2147483647) @@ -73,6 +86,12 @@ class Text2ImageTaskCreationRequest(BaseModel): parameters: Txt2ImageParametersField = Field(...) +class Image2ImageTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Image2ImageInputField = Field(...) + parameters: Image2ImageParametersField = Field(...) + + class Text2VideoTaskCreationRequest(BaseModel): model: str = Field(...) input: Text2VideoInputField = Field(...) @@ -135,7 +154,12 @@ async def process_task( url: str, request_model: Type[T], response_model: Type[R], - payload: Union[Text2ImageTaskCreationRequest, Text2VideoTaskCreationRequest, Image2VideoTaskCreationRequest], + payload: Union[ + Text2ImageTaskCreationRequest, + Image2ImageTaskCreationRequest, + Text2VideoTaskCreationRequest, + Image2VideoTaskCreationRequest, + ], node_id: str, estimated_duration: int, poll_interval: int, @@ -288,6 +312,128 @@ class WanTextToImageApi(comfy_io.ComfyNode): return comfy_io.NodeOutput(await download_url_to_image_tensor(str(response.output.results[0].url))) +class WanImageToImageApi(comfy_io.ComfyNode): + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="WanImageToImageApi", + display_name="Wan Image to Image", + category="api node/image/Wan", + description="Generates an image from one or two input images and a text prompt. " + "The output image is currently fixed at 1.6 MP; its aspect ratio matches the input image(s).", + inputs=[ + comfy_io.Combo.Input( + "model", + options=["wan2.5-i2i-preview"], + default="wan2.5-i2i-preview", + tooltip="Model to use.", + ), + comfy_io.Image.Input( + "image", + tooltip="Single-image editing or multi-image fusion, maximum 2 images.", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", + ), + comfy_io.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid.", + optional=True, + ), + # redo this later as an optional combo of recommended resolutions + # comfy_io.Int.Input( + # "width", + # default=1280, + # min=384, + # max=1440, + # step=16, + # optional=True, + # ), + # comfy_io.Int.Input( + # "height", + # default=1280, + # min=384, + # max=1440, + # step=16, + # optional=True, + # ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + comfy_io.Boolean.Input( + "watermark", + default=True, + tooltip="Whether to add an \"AI generated\" watermark to the result.", + optional=True, + ), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + image: torch.Tensor, + prompt: str, + negative_prompt: str = "", + # width: int = 1024, + # height: int = 1024, + seed: int = 0, + watermark: bool = True, + ): + n_images = get_number_of_images(image) + if n_images not in (1, 2): + raise ValueError(f"Expected 1 or 2 input images, got {n_images}.") + images = [] + for i in image: + images.append("data:image/png;base64," + tensor_to_base64_string(i, total_pixels=4096*4096)) + payload = Image2ImageTaskCreationRequest( + model=model, + input=Image2ImageInputField(prompt=prompt, negative_prompt=negative_prompt, images=images), + parameters=Image2ImageParametersField( + # size=f"{width}*{height}", + seed=seed, + watermark=watermark, + ), + ) + response = await process_task( + { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + "/proxy/wan/api/v1/services/aigc/image2image/image-synthesis", + request_model=Image2ImageTaskCreationRequest, + response_model=ImageTaskStatusResponse, + payload=payload, + node_id=cls.hidden.unique_id, + estimated_duration=42, + poll_interval=3, + ) + return comfy_io.NodeOutput(await download_url_to_image_tensor(str(response.output.results[0].url))) + + class WanTextToVideoApi(comfy_io.ComfyNode): @classmethod def define_schema(cls): @@ -593,6 +739,7 @@ class WanApiExtension(ComfyExtension): async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: return [ WanTextToImageApi, + WanImageToImageApi, WanTextToVideoApi, WanImageToVideoApi, ] From b1111c2062ce35d4292bcd94f27c099a13c619cb Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:03:35 +0300 Subject: [PATCH 352/759] convert nodes_mochi.py to V3 schema (#10069) --- comfy_extras/nodes_mochi.py | 49 +++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/comfy_extras/nodes_mochi.py b/comfy_extras/nodes_mochi.py index 1c474faa9..d750194fc 100644 --- a/comfy_extras/nodes_mochi.py +++ b/comfy_extras/nodes_mochi.py @@ -1,23 +1,40 @@ -import nodes +from typing_extensions import override import torch import comfy.model_management +import nodes +from comfy_api.latest import ComfyExtension, io -class EmptyMochiLatentVideo: + +class EmptyMochiLatentVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 25, "min": 7, "max": nodes.MAX_RESOLUTION, "step": 6}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" + def define_schema(cls): + return io.Schema( + node_id="EmptyMochiLatentVideo", + category="latent/video", + inputs=[ + io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=25, min=7, max=nodes.MAX_RESOLUTION, step=6), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(), + ], + ) - CATEGORY = "latent/video" - - def generate(self, width, height, length, batch_size=1): + @classmethod + def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput: latent = torch.zeros([batch_size, 12, ((length - 1) // 6) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - return ({"samples":latent}, ) + return io.NodeOutput({"samples": latent}) -NODE_CLASS_MAPPINGS = { - "EmptyMochiLatentVideo": EmptyMochiLatentVideo, -} + +class MochiExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + EmptyMochiLatentVideo, + ] + + +async def comfy_entrypoint() -> MochiExtension: + return MochiExtension() From 041b8824f50e01803637d5e83c3f4edaf628f43a Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:05:28 +0300 Subject: [PATCH 353/759] convert nodes_perpneg.py to V3 schema (#10081) --- comfy_extras/nodes_perpneg.py | 93 +++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 38 deletions(-) diff --git a/comfy_extras/nodes_perpneg.py b/comfy_extras/nodes_perpneg.py index 89e5eef90..cd068ce9c 100644 --- a/comfy_extras/nodes_perpneg.py +++ b/comfy_extras/nodes_perpneg.py @@ -5,6 +5,9 @@ import comfy.samplers import comfy.utils import node_helpers import math +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io + def perp_neg(x, noise_pred_pos, noise_pred_neg, noise_pred_nocond, neg_scale, cond_scale): pos = noise_pred_pos - noise_pred_nocond @@ -16,20 +19,27 @@ def perp_neg(x, noise_pred_pos, noise_pred_neg, noise_pred_nocond, neg_scale, co return cfg_result #TODO: This node should be removed, it has been replaced with PerpNegGuider -class PerpNeg: +class PerpNeg(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"model": ("MODEL", ), - "empty_conditioning": ("CONDITIONING", ), - "neg_scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="PerpNeg", + display_name="Perp-Neg (DEPRECATED by PerpNegGuider)", + category="_for_testing", + inputs=[ + io.Model.Input("model"), + io.Conditioning.Input("empty_conditioning"), + io.Float.Input("neg_scale", default=1.0, min=0.0, max=100.0, step=0.01), + ], + outputs=[ + io.Model.Output(), + ], + is_experimental=True, + is_deprecated=True, + ) - CATEGORY = "_for_testing" - DEPRECATED = True - - def patch(self, model, empty_conditioning, neg_scale): + @classmethod + def execute(cls, model, empty_conditioning, neg_scale) -> io.NodeOutput: m = model.clone() nocond = comfy.sampler_helpers.convert_cond(empty_conditioning) @@ -50,7 +60,7 @@ class PerpNeg: m.set_model_sampler_cfg_function(cfg_function) - return (m, ) + return io.NodeOutput(m) class Guider_PerpNeg(comfy.samplers.CFGGuider): @@ -112,35 +122,42 @@ class Guider_PerpNeg(comfy.samplers.CFGGuider): return cfg_result -class PerpNegGuider: +class PerpNegGuider(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model": ("MODEL",), - "positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "empty_conditioning": ("CONDITIONING", ), - "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), - "neg_scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}), - } - } + def define_schema(cls): + return io.Schema( + node_id="PerpNegGuider", + category="_for_testing", + inputs=[ + io.Model.Input("model"), + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Conditioning.Input("empty_conditioning"), + io.Float.Input("cfg", default=8.0, min=0.0, max=100.0, step=0.1, round=0.01), + io.Float.Input("neg_scale", default=1.0, min=0.0, max=100.0, step=0.01), + ], + outputs=[ + io.Guider.Output(), + ], + is_experimental=True, + ) - RETURN_TYPES = ("GUIDER",) - - FUNCTION = "get_guider" - CATEGORY = "_for_testing" - - def get_guider(self, model, positive, negative, empty_conditioning, cfg, neg_scale): + @classmethod + def execute(cls, model, positive, negative, empty_conditioning, cfg, neg_scale) -> io.NodeOutput: guider = Guider_PerpNeg(model) guider.set_conds(positive, negative, empty_conditioning) guider.set_cfg(cfg, neg_scale) - return (guider,) + return io.NodeOutput(guider) -NODE_CLASS_MAPPINGS = { - "PerpNeg": PerpNeg, - "PerpNegGuider": PerpNegGuider, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "PerpNeg": "Perp-Neg (DEPRECATED by PerpNegGuider)", -} +class PerpNegExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + PerpNeg, + PerpNegGuider, + ] + + +async def comfy_entrypoint() -> PerpNegExtension: + return PerpNegExtension() From ed0f4a609b5e6821f97db5cb1715068c25f78e7b Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Mon, 29 Sep 2025 12:16:02 -0700 Subject: [PATCH 354/759] dont cache new locale entry points (#10101) --- middleware/cache_middleware.py | 11 ++++++----- tests-unit/server_test/test_cache_control.py | 7 +++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/middleware/cache_middleware.py b/middleware/cache_middleware.py index 374ef7934..f02135369 100644 --- a/middleware/cache_middleware.py +++ b/middleware/cache_middleware.py @@ -26,11 +26,12 @@ async def cache_control( """Cache control middleware that sets appropriate cache headers based on file type and response status""" response: web.Response = await handler(request) - if ( - request.path.endswith(".js") - or request.path.endswith(".css") - or request.path.endswith("index.json") - ): + path_filename = request.path.rsplit("/", 1)[-1] + is_entry_point = path_filename.startswith("index") and path_filename.endswith( + ".json" + ) + + if request.path.endswith(".js") or request.path.endswith(".css") or is_entry_point: response.headers.setdefault("Cache-Control", "no-cache") return response diff --git a/tests-unit/server_test/test_cache_control.py b/tests-unit/server_test/test_cache_control.py index 8de59125a..fa68d9408 100644 --- a/tests-unit/server_test/test_cache_control.py +++ b/tests-unit/server_test/test_cache_control.py @@ -48,6 +48,13 @@ CACHE_SCENARIOS = [ "expected_cache": "no-cache", "should_have_header": True, }, + { + "name": "localized_index_json_no_cache", + "path": "/templates/index.zh.json", + "status": 200, + "expected_cache": "no-cache", + "should_have_header": True, + }, # Non-matching files { "name": "html_no_header", From 8accf50908094d9cd39168981fa5394274d25491 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:35:51 +0300 Subject: [PATCH 355/759] convert nodes_mahiro.py to V3 schema (#10070) --- comfy_extras/nodes_mahiro.py | 50 ++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/comfy_extras/nodes_mahiro.py b/comfy_extras/nodes_mahiro.py index 8fcdfba75..07b3353f4 100644 --- a/comfy_extras/nodes_mahiro.py +++ b/comfy_extras/nodes_mahiro.py @@ -1,17 +1,29 @@ +from typing_extensions import override import torch import torch.nn.functional as F -class Mahiro: +from comfy_api.latest import ComfyExtension, io + + +class Mahiro(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"model": ("MODEL",), - }} - RETURN_TYPES = ("MODEL",) - RETURN_NAMES = ("patched_model",) - FUNCTION = "patch" - CATEGORY = "_for_testing" - DESCRIPTION = "Modify the guidance to scale more on the 'direction' of the positive prompt rather than the difference between the negative prompt." - def patch(self, model): + def define_schema(cls): + return io.Schema( + node_id="Mahiro", + display_name="Mahiro is so cute that she deserves a better guidance function!! (。・ω・。)", + category="_for_testing", + description="Modify the guidance to scale more on the 'direction' of the positive prompt rather than the difference between the negative prompt.", + inputs=[ + io.Model.Input("model"), + ], + outputs=[ + io.Model.Output(display_name="patched_model"), + ], + is_experimental=True, + ) + + @classmethod + def execute(cls, model) -> io.NodeOutput: m = model.clone() def mahiro_normd(args): scale: float = args['cond_scale'] @@ -30,12 +42,16 @@ class Mahiro: wm = (simsc*cfg + (4-simsc)*leap) / 4 return wm m.set_model_sampler_post_cfg_function(mahiro_normd) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "Mahiro": Mahiro -} -NODE_DISPLAY_NAME_MAPPINGS = { - "Mahiro": "Mahiro is so cute that she deserves a better guidance function!! (。・ω・。)", -} +class MahiroExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + Mahiro, + ] + + +async def comfy_entrypoint() -> MahiroExtension: + return MahiroExtension() From 7f38e4c538de2fa38d0539c18577cdd0e5d251c2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 29 Sep 2025 14:27:52 -0700 Subject: [PATCH 356/759] Add action to create cached deps with manually specified torch. (#10102) --- .../windows_release_dependencies_manual.yml | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 .github/workflows/windows_release_dependencies_manual.yml diff --git a/.github/workflows/windows_release_dependencies_manual.yml b/.github/workflows/windows_release_dependencies_manual.yml new file mode 100644 index 000000000..0799feef1 --- /dev/null +++ b/.github/workflows/windows_release_dependencies_manual.yml @@ -0,0 +1,64 @@ +name: "Windows Release dependencies Manual" + +on: + workflow_dispatch: + inputs: + torch_dependencies: + description: 'torch dependencies' + required: false + type: string + default: "torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu128" + cache_tag: + description: 'Cached dependencies tag' + required: true + type: string + default: "cu128" + + python_minor: + description: 'python minor version' + required: true + type: string + default: "12" + + python_patch: + description: 'python patch version' + required: true + type: string + default: "10" + +jobs: + build_dependencies: + runs-on: windows-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.${{ inputs.python_minor }}.${{ inputs.python_patch }} + + - shell: bash + run: | + echo "@echo off + call update_comfyui.bat nopause + echo - + echo This will try to update pytorch and all python dependencies. + echo - + echo If you just want to update normally, close this and run update_comfyui.bat instead. + echo - + pause + ..\python_embeded\python.exe -s -m pip install --upgrade ${{ inputs.torch_dependencies }} -r ../ComfyUI/requirements.txt pygit2 + pause" > update_comfyui_and_python_dependencies.bat + + grep -v comfyui requirements.txt > requirements_nocomfyui.txt + python -m pip wheel --no-cache-dir ${{ inputs.torch_dependencies }} -r requirements_nocomfyui.txt pygit2 -w ./temp_wheel_dir + python -m pip install --no-cache-dir ./temp_wheel_dir/* + echo installed basic + ls -lah temp_wheel_dir + mv temp_wheel_dir ${{ inputs.cache_tag }}_python_deps + tar cf ${{ inputs.cache_tag }}_python_deps.tar ${{ inputs.cache_tag }}_python_deps + + - uses: actions/cache/save@v4 + with: + path: | + ${{ inputs.cache_tag }}_python_deps.tar + update_comfyui_and_python_dependencies.bat + key: ${{ runner.os }}-build-${{ inputs.cache_tag }}-${{ inputs.python_minor }} From 1673ace19b9d63a8dc0d388aafdb54abf2497892 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 29 Sep 2025 16:08:42 -0700 Subject: [PATCH 357/759] Make the final release test optional in the stable release action. (#10103) --- .github/workflows/stable-release.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index 924bdec90..5eb4a0783 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -28,6 +28,11 @@ on: required: true type: string default: "nvidia" + test_release: + description: 'Test Release' + required: true + type: boolean + default: true jobs: package_comfy_windows: @@ -104,6 +109,10 @@ jobs: "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_${{ inputs.rel_name }}.7z + - shell: bash + if: ${{ inputs.test_release }} + run: | + cd .. cd ComfyUI_windows_portable python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu From 0db6aabed3942ea71258d25d32dc971a2a2421af Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 29 Sep 2025 16:54:05 -0700 Subject: [PATCH 358/759] Different base files for different release. (#10104) --- .github/workflows/stable-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index 5eb4a0783..40e1bc157 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -101,7 +101,7 @@ jobs: mkdir update cp -r ComfyUI/.ci/update_windows/* ./update/ - cp -r ComfyUI/.ci/windows_base_files/* ./ + cp -r ComfyUI/.ci/windows_${{ inputs.rel_name }}_base_files/* ./ cp ../update_comfyui_and_python_dependencies.bat ./update/ cd .. From 375884842314a2234ddc29132b03c741ce81443b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 29 Sep 2025 16:54:37 -0700 Subject: [PATCH 359/759] Different base files for nvidia and amd portables. (#10105) --- .../run_amd_gpu.bat} | 0 .../README_VERY_IMPORTANT.txt | 0 .../run_cpu.bat | 0 .ci/windows_nvidia_base_files/run_nvidia_gpu.bat | 2 ++ .../run_nvidia_gpu_fast_fp16_accumulation.bat | 0 .github/workflows/windows_release_nightly_pytorch.yml | 2 +- .github/workflows/windows_release_package.yml | 2 +- 7 files changed, 4 insertions(+), 2 deletions(-) rename .ci/{windows_base_files/run_nvidia_gpu.bat => windows_amd_base_files/run_amd_gpu.bat} (100%) rename .ci/{windows_base_files => windows_nvidia_base_files}/README_VERY_IMPORTANT.txt (100%) rename .ci/{windows_base_files => windows_nvidia_base_files}/run_cpu.bat (100%) create mode 100755 .ci/windows_nvidia_base_files/run_nvidia_gpu.bat rename .ci/{windows_base_files => windows_nvidia_base_files}/run_nvidia_gpu_fast_fp16_accumulation.bat (100%) diff --git a/.ci/windows_base_files/run_nvidia_gpu.bat b/.ci/windows_amd_base_files/run_amd_gpu.bat similarity index 100% rename from .ci/windows_base_files/run_nvidia_gpu.bat rename to .ci/windows_amd_base_files/run_amd_gpu.bat diff --git a/.ci/windows_base_files/README_VERY_IMPORTANT.txt b/.ci/windows_nvidia_base_files/README_VERY_IMPORTANT.txt similarity index 100% rename from .ci/windows_base_files/README_VERY_IMPORTANT.txt rename to .ci/windows_nvidia_base_files/README_VERY_IMPORTANT.txt diff --git a/.ci/windows_base_files/run_cpu.bat b/.ci/windows_nvidia_base_files/run_cpu.bat similarity index 100% rename from .ci/windows_base_files/run_cpu.bat rename to .ci/windows_nvidia_base_files/run_cpu.bat diff --git a/.ci/windows_nvidia_base_files/run_nvidia_gpu.bat b/.ci/windows_nvidia_base_files/run_nvidia_gpu.bat new file mode 100755 index 000000000..274d7c948 --- /dev/null +++ b/.ci/windows_nvidia_base_files/run_nvidia_gpu.bat @@ -0,0 +1,2 @@ +.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build +pause diff --git a/.ci/windows_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat b/.ci/windows_nvidia_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat similarity index 100% rename from .ci/windows_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat rename to .ci/windows_nvidia_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat diff --git a/.github/workflows/windows_release_nightly_pytorch.yml b/.github/workflows/windows_release_nightly_pytorch.yml index 5bdc940de..ca1ef71ae 100644 --- a/.github/workflows/windows_release_nightly_pytorch.yml +++ b/.github/workflows/windows_release_nightly_pytorch.yml @@ -68,7 +68,7 @@ jobs: mkdir update cp -r ComfyUI/.ci/update_windows/* ./update/ - cp -r ComfyUI/.ci/windows_base_files/* ./ + cp -r ComfyUI/.ci/windows_nvidia_base_files/* ./ cp -r ComfyUI/.ci/windows_nightly_base_files/* ./ echo "call update_comfyui.bat nopause diff --git a/.github/workflows/windows_release_package.yml b/.github/workflows/windows_release_package.yml index 46375698e..7955325fc 100644 --- a/.github/workflows/windows_release_package.yml +++ b/.github/workflows/windows_release_package.yml @@ -81,7 +81,7 @@ jobs: mkdir update cp -r ComfyUI/.ci/update_windows/* ./update/ - cp -r ComfyUI/.ci/windows_base_files/* ./ + cp -r ComfyUI/.ci/windows_nvidia_base_files/* ./ cp ../update_comfyui_and_python_dependencies.bat ./update/ cd .. From 342cf644ce495dafaa31dd49d42c47c5e242e701 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 29 Sep 2025 17:05:44 -0700 Subject: [PATCH 360/759] Add a way to have different names for stable nvidia portables. (#10106) --- .github/workflows/stable-release.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index 40e1bc157..1cbbfbf69 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -28,6 +28,11 @@ on: required: true type: string default: "nvidia" + rel_extra_name: + description: 'Release extra name' + required: false + type: string + default: "" test_release: description: 'Test Release' required: true @@ -107,7 +112,7 @@ jobs: cd .. "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable - mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_${{ inputs.rel_name }}.7z + mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_${{ inputs.rel_name }}${{ inputs.rel_extra_name }}.7z - shell: bash if: ${{ inputs.test_release }} @@ -123,7 +128,7 @@ jobs: - name: Upload binaries to release uses: softprops/action-gh-release@v2 with: - files: ComfyUI_windows_portable_${{ inputs.rel_name }}.7z + files: ComfyUI_windows_portable_${{ inputs.rel_name }}${{ inputs.rel_extra_name }}.7z tag_name: ${{ inputs.git_tag }} draft: true overwrite_files: true From bed4b49d08d80e195cb42d5294037fc6b631942e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 29 Sep 2025 17:31:15 -0700 Subject: [PATCH 361/759] Add action to do the full stable release. (#10107) --- .github/workflows/release-stable-all.yml | 49 ++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 .github/workflows/release-stable-all.yml diff --git a/.github/workflows/release-stable-all.yml b/.github/workflows/release-stable-all.yml new file mode 100644 index 000000000..aac84d637 --- /dev/null +++ b/.github/workflows/release-stable-all.yml @@ -0,0 +1,49 @@ +name: "Release Stable All Portable Versions" + +on: + workflow_dispatch: + inputs: + git_tag: + description: 'Git tag' + required: true + type: string + +jobs: + release_nvidia_default: + name: "Release NVIDIA Default (cu129)" + uses: ./.github/workflows/stable-release.yml + with: + git_tag: ${{ inputs.git_tag }} + cache_tag: "cu129" + python_minor: "13" + python_patch: "6" + rel_name: "nvidia" + rel_extra_name: "" + test_release: true + secrets: inherit + + release_nvidia_cu128: + name: "Release NVIDIA cu128" + uses: ./.github/workflows/stable-release.yml + with: + git_tag: ${{ inputs.git_tag }} + cache_tag: "cu128" + python_minor: "12" + python_patch: "10" + rel_name: "nvidia" + rel_extra_name: "_cu128" + test_release: true + secrets: inherit + + release_amd_rocm: + name: "Release AMD ROCm 6.4.4" + uses: ./.github/workflows/stable-release.yml + with: + git_tag: ${{ inputs.git_tag }} + cache_tag: "rocm644" + python_minor: "12" + python_patch: "10" + rel_name: "amd" + rel_extra_name: "" + test_release: false + secrets: inherit From 447884b65740d9f4160ef13d55adb49ca111140e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 29 Sep 2025 17:37:51 -0700 Subject: [PATCH 362/759] Make stable release workflow callable. (#10108) --- .github/workflows/stable-release.yml | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index 1cbbfbf69..28484a9d1 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -2,6 +2,42 @@ name: "Release Stable Version" on: + workflow_call: + inputs: + git_tag: + description: 'Git tag' + required: true + type: string + cache_tag: + description: 'Cached dependencies tag' + required: true + type: string + default: "cu129" + python_minor: + description: 'Python minor version' + required: true + type: string + default: "13" + python_patch: + description: 'Python patch version' + required: true + type: string + default: "6" + rel_name: + description: 'Release name' + required: true + type: string + default: "nvidia" + rel_extra_name: + description: 'Release extra name' + required: false + type: string + default: "" + test_release: + description: 'Test Release' + required: true + type: boolean + default: true workflow_dispatch: inputs: git_tag: From 414a178fb690ef9998f65419f03ef1a83cf559de Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 29 Sep 2025 20:03:02 -0700 Subject: [PATCH 363/759] Add basic readme for AMD portable. (#10109) --- .../README_VERY_IMPORTANT.txt | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100755 .ci/windows_amd_base_files/README_VERY_IMPORTANT.txt diff --git a/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt b/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt new file mode 100755 index 000000000..570ac3398 --- /dev/null +++ b/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt @@ -0,0 +1,24 @@ +As of the time of writing this you need this preview driver for best results: +https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOWS-PYTORCH-PREVIEW.html + +HOW TO RUN: + +if you have a AMD gpu: + +run_amd_gpu.bat + + +IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints + +You can download the stable diffusion XL one from: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors + + +RECOMMENDED WAY TO UPDATE: +To update the ComfyUI code: update\update_comfyui.bat + + +TO SHARE MODELS BETWEEN COMFYUI AND ANOTHER UI: +In the ComfyUI directory you will find a file: extra_model_paths.yaml.example +Rename this file to: extra_model_paths.yaml and edit it with your favorite text editor. + + From 977a4ed8c55ade53d0d6cfe1fe8a6396ee35a2ec Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 29 Sep 2025 23:04:42 -0400 Subject: [PATCH 364/759] ComfyUI version 0.3.61 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index d469a8194..737b72131 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.60" +__version__ = "0.3.61" diff --git a/pyproject.toml b/pyproject.toml index 7340c320b..e851560f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.60" +version = "0.3.61" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 6e079abc3a3fc0fb98e2a0848877874151310ed1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 29 Sep 2025 20:11:37 -0700 Subject: [PATCH 365/759] Workflow permission fix. (#10110) --- .github/workflows/release-stable-all.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/release-stable-all.yml b/.github/workflows/release-stable-all.yml index aac84d637..5c1024599 100644 --- a/.github/workflows/release-stable-all.yml +++ b/.github/workflows/release-stable-all.yml @@ -10,6 +10,10 @@ on: jobs: release_nvidia_default: + permissions: + contents: "write" + packages: "write" + pull-requests: "read" name: "Release NVIDIA Default (cu129)" uses: ./.github/workflows/stable-release.yml with: @@ -23,6 +27,10 @@ jobs: secrets: inherit release_nvidia_cu128: + permissions: + contents: "write" + packages: "write" + pull-requests: "read" name: "Release NVIDIA cu128" uses: ./.github/workflows/stable-release.yml with: @@ -36,6 +44,10 @@ jobs: secrets: inherit release_amd_rocm: + permissions: + contents: "write" + packages: "write" + pull-requests: "read" name: "Release AMD ROCm 6.4.4" uses: ./.github/workflows/stable-release.yml with: From f48d7230de2f7b10fe8bfda3d7f53241d19c7266 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 30 Sep 2025 09:17:49 -0700 Subject: [PATCH 366/759] Add new portable links to readme. (#10112) --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 5a257687b..8f24a33ee 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,12 @@ Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you If you have trouble extracting it, right click the file -> properties -> unblock +#### Alternative Downloads: + +[Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z) + +[Portable with pytorch cuda 12.8 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z) (Supports Nvidia 10 series and older GPUs). + #### How do I share models between another UI and ComfyUI? See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor. From 631b9ae861bf8bdd3c538da232e4c8938448e59d Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 30 Sep 2025 20:21:47 +0300 Subject: [PATCH 367/759] fix(Rodin3D-Gen2): missing "task_uuid" parameter (#10128) --- comfy_api_nodes/nodes_rodin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index 817efb0f5..633ac46d3 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -540,7 +540,7 @@ class Rodin3D_Gen2(Rodin3DAPI): **kwargs) await self.poll_for_task_status(subscription_key, **kwargs) download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list) + model = await self.download_files(download_list, task_uuid) return (model,) From b682a73c55a6434fdd9293d45ace969597f8ad65 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 30 Sep 2025 20:43:41 +0300 Subject: [PATCH 368/759] enable Seedance Pro model in the FirstLastFrame node (#10120) --- comfy_api_nodes/nodes_bytedance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index a7eeaf15a..654d6a362 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -920,7 +920,7 @@ class ByteDanceFirstLastFrameNode(comfy_io.ComfyNode): inputs=[ comfy_io.Combo.Input( "model", - options=[Image2VideoModelName.seedance_1_lite.value], + options=[model.value for model in Image2VideoModelName], default=Image2VideoModelName.seedance_1_lite.value, tooltip="Model name", ), From bab8ba20bf47d985d6b1d73627c2add76bd4e716 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 30 Sep 2025 15:12:07 -0400 Subject: [PATCH 369/759] ComfyUI version 0.3.62. --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 737b72131..ac76fbe35 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.61" +__version__ = "0.3.62" diff --git a/pyproject.toml b/pyproject.toml index e851560f7..d0a76c6d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.61" +version = "0.3.62" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From c4a8cf60ab5d6eaf052b7a08f5ee97104acf7a2f Mon Sep 17 00:00:00 2001 From: AustinMroz Date: Tue, 30 Sep 2025 22:12:32 -0700 Subject: [PATCH 370/759] Bump frontend to 1.27.7 (#10133) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 45d3e1607..588c5dcf0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.26.13 +comfyui-frontend-package==1.27.7 comfyui-workflow-templates==0.1.91 comfyui-embedded-docs==0.2.6 torch From 638097829d2352a1c78ab4fbb1e028d1e7cff012 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 1 Oct 2025 09:00:22 +0300 Subject: [PATCH 371/759] convert nodes_audio_encoder.py to V3 schema (#10123) --- comfy_api/latest/_io.py | 1 + comfy_extras/nodes_audio_encoder.py | 68 ++++++++++++++++++----------- 2 files changed, 44 insertions(+), 25 deletions(-) diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 4826818df..2d95cffd6 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -1605,6 +1605,7 @@ class _IO: Model = Model ClipVision = ClipVision ClipVisionOutput = ClipVisionOutput + AudioEncoder = AudioEncoder AudioEncoderOutput = AudioEncoderOutput StyleModel = StyleModel Gligen = Gligen diff --git a/comfy_extras/nodes_audio_encoder.py b/comfy_extras/nodes_audio_encoder.py index 39a140fef..13aacd41a 100644 --- a/comfy_extras/nodes_audio_encoder.py +++ b/comfy_extras/nodes_audio_encoder.py @@ -1,44 +1,62 @@ import folder_paths import comfy.audio_encoders.audio_encoders import comfy.utils +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class AudioEncoderLoader: +class AudioEncoderLoader(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "audio_encoder_name": (folder_paths.get_filename_list("audio_encoders"), ), - }} - RETURN_TYPES = ("AUDIO_ENCODER",) - FUNCTION = "load_model" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="AudioEncoderLoader", + category="loaders", + inputs=[ + io.Combo.Input( + "audio_encoder_name", + options=folder_paths.get_filename_list("audio_encoders"), + ), + ], + outputs=[io.AudioEncoder.Output()], + ) - CATEGORY = "loaders" - - def load_model(self, audio_encoder_name): + @classmethod + def execute(cls, audio_encoder_name) -> io.NodeOutput: audio_encoder_name = folder_paths.get_full_path_or_raise("audio_encoders", audio_encoder_name) sd = comfy.utils.load_torch_file(audio_encoder_name, safe_load=True) audio_encoder = comfy.audio_encoders.audio_encoders.load_audio_encoder_from_sd(sd) if audio_encoder is None: raise RuntimeError("ERROR: audio encoder file is invalid and does not contain a valid model.") - return (audio_encoder,) + return io.NodeOutput(audio_encoder) -class AudioEncoderEncode: +class AudioEncoderEncode(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "audio_encoder": ("AUDIO_ENCODER",), - "audio": ("AUDIO",), - }} - RETURN_TYPES = ("AUDIO_ENCODER_OUTPUT",) - FUNCTION = "encode" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="AudioEncoderEncode", + category="conditioning", + inputs=[ + io.AudioEncoder.Input("audio_encoder"), + io.Audio.Input("audio"), + ], + outputs=[io.AudioEncoderOutput.Output()], + ) - CATEGORY = "conditioning" - - def encode(self, audio_encoder, audio): + @classmethod + def execute(cls, audio_encoder, audio) -> io.NodeOutput: output = audio_encoder.encode_audio(audio["waveform"], audio["sample_rate"]) - return (output,) + return io.NodeOutput(output) -NODE_CLASS_MAPPINGS = { - "AudioEncoderLoader": AudioEncoderLoader, - "AudioEncoderEncode": AudioEncoderEncode, -} +class AudioEncoder(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + AudioEncoderLoader, + AudioEncoderEncode, + ] + + +async def comfy_entrypoint() -> AudioEncoder: + return AudioEncoder() From 7eb7160db487feb891ceabdf985b09f9a8091869 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 1 Oct 2025 22:16:59 +0300 Subject: [PATCH 372/759] convert nodes_gits.py to V3 schema (#9949) --- comfy_extras/nodes_gits.py | 49 ++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/comfy_extras/nodes_gits.py b/comfy_extras/nodes_gits.py index 47b1dd049..25367560a 100644 --- a/comfy_extras/nodes_gits.py +++ b/comfy_extras/nodes_gits.py @@ -1,6 +1,8 @@ # from https://github.com/zju-pi/diff-sampler/tree/main/gits-main import numpy as np import torch +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io def loglinear_interp(t_steps, num_steps): """ @@ -333,25 +335,28 @@ NOISE_LEVELS = { ], } -class GITSScheduler: +class GITSScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"coeff": ("FLOAT", {"default": 1.20, "min": 0.80, "max": 1.50, "step": 0.05}), - "steps": ("INT", {"default": 10, "min": 2, "max": 1000}), - "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" + def define_schema(cls): + return io.Schema( + node_id="GITSScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Float.Input("coeff", default=1.20, min=0.80, max=1.50, step=0.05), + io.Int.Input("steps", default=10, min=2, max=1000), + io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01), + ], + outputs=[ + io.Sigmas.Output(), + ], + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, coeff, steps, denoise): + @classmethod + def execute(cls, coeff, steps, denoise): total_steps = steps if denoise < 1.0: if denoise <= 0.0: - return (torch.FloatTensor([]),) + return io.NodeOutput(torch.FloatTensor([])) total_steps = round(steps * denoise) if steps <= 20: @@ -362,8 +367,16 @@ class GITSScheduler: sigmas = sigmas[-(total_steps + 1):] sigmas[-1] = 0 - return (torch.FloatTensor(sigmas), ) + return io.NodeOutput(torch.FloatTensor(sigmas)) -NODE_CLASS_MAPPINGS = { - "GITSScheduler": GITSScheduler, -} + +class GITSSchedulerExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + GITSScheduler, + ] + + +async def comfy_entrypoint() -> GITSSchedulerExtension: + return GITSSchedulerExtension() From e0210ce0a7140e0c61bce7fdb964b5e5e8d31619 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 1 Oct 2025 22:17:33 +0300 Subject: [PATCH 373/759] convert nodes_differential_diffusion.py to V3 schema (#10056) --- comfy_extras/nodes_differential_diffusion.py | 69 ++++++++++++-------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/comfy_extras/nodes_differential_diffusion.py b/comfy_extras/nodes_differential_diffusion.py index 255ac420d..6dfdf466c 100644 --- a/comfy_extras/nodes_differential_diffusion.py +++ b/comfy_extras/nodes_differential_diffusion.py @@ -1,34 +1,41 @@ # code adapted from https://github.com/exx8/differential-diffusion +from typing_extensions import override + import torch +from comfy_api.latest import ComfyExtension, io -class DifferentialDiffusion(): + +class DifferentialDiffusion(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": ("MODEL", ), - }, - "optional": { - "strength": ("FLOAT", { - "default": 1.0, - "min": 0.0, - "max": 1.0, - "step": 0.01, - }), - } - } - RETURN_TYPES = ("MODEL",) - FUNCTION = "apply" - CATEGORY = "_for_testing" - INIT = False + def define_schema(cls): + return io.Schema( + node_id="DifferentialDiffusion", + display_name="Differential Diffusion", + category="_for_testing", + inputs=[ + io.Model.Input("model"), + io.Float.Input( + "strength", + default=1.0, + min=0.0, + max=1.0, + step=0.01, + optional=True, + ), + ], + outputs=[io.Model.Output()], + is_experimental=True, + ) - def apply(self, model, strength=1.0): + @classmethod + def execute(cls, model, strength=1.0) -> io.NodeOutput: model = model.clone() - model.set_model_denoise_mask_function(lambda *args, **kwargs: self.forward(*args, **kwargs, strength=strength)) - return (model, ) + model.set_model_denoise_mask_function(lambda *args, **kwargs: cls.forward(*args, **kwargs, strength=strength)) + return io.NodeOutput(model) - def forward(self, sigma: torch.Tensor, denoise_mask: torch.Tensor, extra_options: dict, strength: float): + @classmethod + def forward(cls, sigma: torch.Tensor, denoise_mask: torch.Tensor, extra_options: dict, strength: float): model = extra_options["model"] step_sigmas = extra_options["sigmas"] sigma_to = model.inner_model.model_sampling.sigma_min @@ -53,9 +60,13 @@ class DifferentialDiffusion(): return binary_mask -NODE_CLASS_MAPPINGS = { - "DifferentialDiffusion": DifferentialDiffusion, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "DifferentialDiffusion": "Differential Diffusion", -} +class DifferentialDiffusionExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + DifferentialDiffusion, + ] + + +async def comfy_entrypoint() -> DifferentialDiffusionExtension: + return DifferentialDiffusionExtension() From 3af1881455fb0c44c3030b2d61b79302933386d2 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 1 Oct 2025 22:18:04 +0300 Subject: [PATCH 374/759] convert nodes_optimalsteps.py to V3 schema (#10074) --- comfy_extras/nodes_optimalsteps.py | 52 +++++++++++++++++++----------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/comfy_extras/nodes_optimalsteps.py b/comfy_extras/nodes_optimalsteps.py index e7c851ca2..73f0104d8 100644 --- a/comfy_extras/nodes_optimalsteps.py +++ b/comfy_extras/nodes_optimalsteps.py @@ -1,9 +1,12 @@ # from https://github.com/bebebe666/OptimalSteps - import numpy as np import torch +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io + + def loglinear_interp(t_steps, num_steps): """ Performs log-linear interpolation of a given array of decreasing numbers. @@ -23,25 +26,28 @@ NOISE_LEVELS = {"FLUX": [0.9968, 0.9886, 0.9819, 0.975, 0.966, 0.9471, 0.9158, 0 "Chroma": [0.992, 0.99, 0.988, 0.985, 0.982, 0.978, 0.973, 0.968, 0.961, 0.953, 0.943, 0.931, 0.917, 0.9, 0.881, 0.858, 0.832, 0.802, 0.769, 0.731, 0.69, 0.646, 0.599, 0.55, 0.501, 0.451, 0.402, 0.355, 0.311, 0.27, 0.232, 0.199, 0.169, 0.143, 0.12, 0.101, 0.084, 0.07, 0.058, 0.048, 0.001], } -class OptimalStepsScheduler: +class OptimalStepsScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model_type": (["FLUX", "Wan", "Chroma"], ), - "steps": ("INT", {"default": 20, "min": 3, "max": 1000}), - "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" + def define_schema(cls): + return io.Schema( + node_id="OptimalStepsScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Combo.Input("model_type", options=["FLUX", "Wan", "Chroma"]), + io.Int.Input("steps", default=20, min=3, max=1000), + io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01), + ], + outputs=[ + io.Sigmas.Output(), + ], + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, model_type, steps, denoise): + @classmethod + def execute(cls, model_type, steps, denoise) ->io.NodeOutput: total_steps = steps if denoise < 1.0: if denoise <= 0.0: - return (torch.FloatTensor([]),) + return io.NodeOutput(torch.FloatTensor([])) total_steps = round(steps * denoise) sigmas = NOISE_LEVELS[model_type][:] @@ -50,8 +56,16 @@ class OptimalStepsScheduler: sigmas = sigmas[-(total_steps + 1):] sigmas[-1] = 0 - return (torch.FloatTensor(sigmas), ) + return io.NodeOutput(torch.FloatTensor(sigmas)) -NODE_CLASS_MAPPINGS = { - "OptimalStepsScheduler": OptimalStepsScheduler, -} + +class OptimalStepsExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + OptimalStepsScheduler, + ] + + +async def comfy_entrypoint() -> OptimalStepsExtension: + return OptimalStepsExtension() From 11bab7be76d0bfdb326e8aea53cdfebd99b42cc5 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 1 Oct 2025 22:18:49 +0300 Subject: [PATCH 375/759] convert nodes_pag.py to V3 schema (#10080) --- comfy_extras/nodes_pag.py | 49 +++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/comfy_extras/nodes_pag.py b/comfy_extras/nodes_pag.py index eb28196f4..79fea5f0c 100644 --- a/comfy_extras/nodes_pag.py +++ b/comfy_extras/nodes_pag.py @@ -3,25 +3,30 @@ #My modified one here is more basic but has less chances of breaking with ComfyUI updates. +from typing_extensions import override + import comfy.model_patcher import comfy.samplers +from comfy_api.latest import ComfyExtension, io -class PerturbedAttentionGuidance: + +class PerturbedAttentionGuidance(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": ("MODEL",), - "scale": ("FLOAT", {"default": 3.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": 0.01}), - } - } + def define_schema(cls): + return io.Schema( + node_id="PerturbedAttentionGuidance", + category="model_patches/unet", + inputs=[ + io.Model.Input("model"), + io.Float.Input("scale", default=3.0, min=0.0, max=100.0, step=0.01, round=0.01), + ], + outputs=[ + io.Model.Output(), + ], + ) - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" - - CATEGORY = "model_patches/unet" - - def patch(self, model, scale): + @classmethod + def execute(cls, model, scale) -> io.NodeOutput: unet_block = "middle" unet_block_id = 0 m = model.clone() @@ -49,8 +54,16 @@ class PerturbedAttentionGuidance: m.set_model_sampler_post_cfg_function(post_cfg_function) - return (m,) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "PerturbedAttentionGuidance": PerturbedAttentionGuidance, -} + +class PAGExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + PerturbedAttentionGuidance, + ] + + +async def comfy_entrypoint() -> PAGExtension: + return PAGExtension() From d9c0a4053d955c7fd3400be07001bc4e774591e1 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 1 Oct 2025 22:19:56 +0300 Subject: [PATCH 376/759] convert nodes_lt.py to V3 schema (#10084) --- comfy_extras/nodes_lt.py | 412 ++++++++++++++++++++++----------------- 1 file changed, 228 insertions(+), 184 deletions(-) diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index f82337a67..b51d15804 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -1,4 +1,3 @@ -import io import nodes import node_helpers import torch @@ -8,46 +7,60 @@ import comfy.utils import math import numpy as np import av +from io import BytesIO +from typing_extensions import override from comfy.ldm.lightricks.symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords +from comfy_api.latest import ComfyExtension, io -class EmptyLTXVLatentVideo: +class EmptyLTXVLatentVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "width": ("INT", {"default": 768, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), - "height": ("INT", {"default": 512, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), - "length": ("INT", {"default": 97, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 8}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" + def define_schema(cls): + return io.Schema( + node_id="EmptyLTXVLatentVideo", + category="latent/video/ltxv", + inputs=[ + io.Int.Input("width", default=768, min=64, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("height", default=512, min=64, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("length", default=97, min=1, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(), + ], + ) - CATEGORY = "latent/video/ltxv" - - def generate(self, width, height, length, batch_size=1): + @classmethod + def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput: latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device()) - return ({"samples": latent}, ) + return io.NodeOutput({"samples": latent}) -class LTXVImgToVideo: +class LTXVImgToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE",), - "image": ("IMAGE",), - "width": ("INT", {"default": 768, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), - "height": ("INT", {"default": 512, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), - "length": ("INT", {"default": 97, "min": 9, "max": nodes.MAX_RESOLUTION, "step": 8}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0}), - }} + def define_schema(cls): + return io.Schema( + node_id="LTXVImgToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Image.Input("image"), + io.Int.Input("width", default=768, min=64, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("height", default=512, min=64, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("length", default=97, min=9, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Float.Input("strength", default=1.0, min=0.0, max=1.0), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - - CATEGORY = "conditioning/video_models" - FUNCTION = "generate" - - def generate(self, positive, negative, image, vae, width, height, length, batch_size, strength): + @classmethod + def execute(cls, positive, negative, image, vae, width, height, length, batch_size, strength) -> io.NodeOutput: pixels = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) encode_pixels = pixels[:, :, :, :3] t = vae.encode(encode_pixels) @@ -62,7 +75,7 @@ class LTXVImgToVideo: ) conditioning_latent_frames_mask[:, :, :t.shape[2]] = 1.0 - strength - return (positive, negative, {"samples": latent, "noise_mask": conditioning_latent_frames_mask}, ) + return io.NodeOutput(positive, negative, {"samples": latent, "noise_mask": conditioning_latent_frames_mask}) def conditioning_get_any_value(conditioning, key, default=None): @@ -93,35 +106,46 @@ def get_keyframe_idxs(cond): num_keyframes = torch.unique(keyframe_idxs[:, 0]).shape[0] return keyframe_idxs, num_keyframes -class LTXVAddGuide: +class LTXVAddGuide(io.ComfyNode): + NUM_PREFIX_FRAMES = 2 + PATCHIFIER = SymmetricPatchifier(1) + @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE",), - "latent": ("LATENT",), - "image": ("IMAGE", {"tooltip": "Image or video to condition the latent video on. Must be 8*n + 1 frames." - "If the video is not 8*n + 1 frames, it will be cropped to the nearest 8*n + 1 frames."}), - "frame_idx": ("INT", {"default": 0, "min": -9999, "max": 9999, - "tooltip": "Frame index to start the conditioning at. For single-frame images or " - "videos with 1-8 frames, any frame_idx value is acceptable. For videos with 9+ " - "frames, frame_idx must be divisible by 8, otherwise it will be rounded down to " - "the nearest multiple of 8. Negative values are counted from the end of the video."}), - "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - } - } + def define_schema(cls): + return io.Schema( + node_id="LTXVAddGuide", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Latent.Input("latent"), + io.Image.Input( + "image", + tooltip="Image or video to condition the latent video on. Must be 8*n + 1 frames. " + "If the video is not 8*n + 1 frames, it will be cropped to the nearest 8*n + 1 frames.", + ), + io.Int.Input( + "frame_idx", + default=0, + min=-9999, + max=9999, + tooltip="Frame index to start the conditioning at. " + "For single-frame images or videos with 1-8 frames, any frame_idx value is acceptable. " + "For videos with 9+ frames, frame_idx must be divisible by 8, otherwise it will be rounded " + "down to the nearest multiple of 8. Negative values are counted from the end of the video.", + ), + io.Float.Input("strength", default=1.0, min=0.0, max=1.0, step=0.01), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - - CATEGORY = "conditioning/video_models" - FUNCTION = "generate" - - def __init__(self): - self._num_prefix_frames = 2 - self._patchifier = SymmetricPatchifier(1) - - def encode(self, vae, latent_width, latent_height, images, scale_factors): + @classmethod + def encode(cls, vae, latent_width, latent_height, images, scale_factors): time_scale_factor, width_scale_factor, height_scale_factor = scale_factors images = images[:(images.shape[0] - 1) // time_scale_factor * time_scale_factor + 1] pixels = comfy.utils.common_upscale(images.movedim(-1, 1), latent_width * width_scale_factor, latent_height * height_scale_factor, "bilinear", crop="disabled").movedim(1, -1) @@ -129,7 +153,8 @@ class LTXVAddGuide: t = vae.encode(encode_pixels) return encode_pixels, t - def get_latent_index(self, cond, latent_length, guide_length, frame_idx, scale_factors): + @classmethod + def get_latent_index(cls, cond, latent_length, guide_length, frame_idx, scale_factors): time_scale_factor, _, _ = scale_factors _, num_keyframes = get_keyframe_idxs(cond) latent_count = latent_length - num_keyframes @@ -141,9 +166,10 @@ class LTXVAddGuide: return frame_idx, latent_idx - def add_keyframe_index(self, cond, frame_idx, guiding_latent, scale_factors): + @classmethod + def add_keyframe_index(cls, cond, frame_idx, guiding_latent, scale_factors): keyframe_idxs, _ = get_keyframe_idxs(cond) - _, latent_coords = self._patchifier.patchify(guiding_latent) + _, latent_coords = cls.PATCHIFIER.patchify(guiding_latent) pixel_coords = latent_to_pixel_coords(latent_coords, scale_factors, causal_fix=frame_idx == 0) # we need the causal fix only if we're placing the new latents at index 0 pixel_coords[:, 0] += frame_idx if keyframe_idxs is None: @@ -152,8 +178,9 @@ class LTXVAddGuide: keyframe_idxs = torch.cat([keyframe_idxs, pixel_coords], dim=2) return node_helpers.conditioning_set_values(cond, {"keyframe_idxs": keyframe_idxs}) - def append_keyframe(self, positive, negative, frame_idx, latent_image, noise_mask, guiding_latent, strength, scale_factors): - _, latent_idx = self.get_latent_index( + @classmethod + def append_keyframe(cls, positive, negative, frame_idx, latent_image, noise_mask, guiding_latent, strength, scale_factors): + _, latent_idx = cls.get_latent_index( cond=positive, latent_length=latent_image.shape[2], guide_length=guiding_latent.shape[2], @@ -162,8 +189,8 @@ class LTXVAddGuide: ) noise_mask[:, :, latent_idx:latent_idx + guiding_latent.shape[2]] = 1.0 - positive = self.add_keyframe_index(positive, frame_idx, guiding_latent, scale_factors) - negative = self.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors) + positive = cls.add_keyframe_index(positive, frame_idx, guiding_latent, scale_factors) + negative = cls.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors) mask = torch.full( (noise_mask.shape[0], 1, guiding_latent.shape[2], noise_mask.shape[3], noise_mask.shape[4]), @@ -176,7 +203,8 @@ class LTXVAddGuide: noise_mask = torch.cat([noise_mask, mask], dim=2) return positive, negative, latent_image, noise_mask - def replace_latent_frames(self, latent_image, noise_mask, guiding_latent, latent_idx, strength): + @classmethod + def replace_latent_frames(cls, latent_image, noise_mask, guiding_latent, latent_idx, strength): cond_length = guiding_latent.shape[2] assert latent_image.shape[2] >= latent_idx + cond_length, "Conditioning frames exceed the length of the latent sequence." @@ -195,20 +223,21 @@ class LTXVAddGuide: return latent_image, noise_mask - def generate(self, positive, negative, vae, latent, image, frame_idx, strength): + @classmethod + def execute(cls, positive, negative, vae, latent, image, frame_idx, strength) -> io.NodeOutput: scale_factors = vae.downscale_index_formula latent_image = latent["samples"] noise_mask = get_noise_mask(latent) _, _, latent_length, latent_height, latent_width = latent_image.shape - image, t = self.encode(vae, latent_width, latent_height, image, scale_factors) + image, t = cls.encode(vae, latent_width, latent_height, image, scale_factors) - frame_idx, latent_idx = self.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors) + frame_idx, latent_idx = cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors) assert latent_idx + t.shape[2] <= latent_length, "Conditioning frames exceed the length of the latent sequence." - num_prefix_frames = min(self._num_prefix_frames, t.shape[2]) + num_prefix_frames = min(cls.NUM_PREFIX_FRAMES, t.shape[2]) - positive, negative, latent_image, noise_mask = self.append_keyframe( + positive, negative, latent_image, noise_mask = cls.append_keyframe( positive, negative, frame_idx, @@ -223,9 +252,9 @@ class LTXVAddGuide: t = t[:, :, num_prefix_frames:] if t.shape[2] == 0: - return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},) + return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask}) - latent_image, noise_mask = self.replace_latent_frames( + latent_image, noise_mask = cls.replace_latent_frames( latent_image, noise_mask, t, @@ -233,34 +262,35 @@ class LTXVAddGuide: strength, ) - return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},) + return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask}) -class LTXVCropGuides: +class LTXVCropGuides(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "latent": ("LATENT",), - } - } + def define_schema(cls): + return io.Schema( + node_id="LTXVCropGuides", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Latent.Input("latent"), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - - CATEGORY = "conditioning/video_models" - FUNCTION = "crop" - - def __init__(self): - self._patchifier = SymmetricPatchifier(1) - - def crop(self, positive, negative, latent): + @classmethod + def execute(cls, positive, negative, latent) -> io.NodeOutput: latent_image = latent["samples"].clone() noise_mask = get_noise_mask(latent) _, num_keyframes = get_keyframe_idxs(positive) if num_keyframes == 0: - return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},) + return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask},) latent_image = latent_image[:, :, :-num_keyframes] noise_mask = noise_mask[:, :, :-num_keyframes] @@ -268,44 +298,52 @@ class LTXVCropGuides: positive = node_helpers.conditioning_set_values(positive, {"keyframe_idxs": None}) negative = node_helpers.conditioning_set_values(negative, {"keyframe_idxs": None}) - return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},) + return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask}) -class LTXVConditioning: +class LTXVConditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "frame_rate": ("FLOAT", {"default": 25.0, "min": 0.0, "max": 1000.0, "step": 0.01}), - }} - RETURN_TYPES = ("CONDITIONING", "CONDITIONING") - RETURN_NAMES = ("positive", "negative") - FUNCTION = "append" + def define_schema(cls): + return io.Schema( + node_id="LTXVConditioning", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Float.Input("frame_rate", default=25.0, min=0.0, max=1000.0, step=0.01), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + ], + ) - CATEGORY = "conditioning/video_models" - - def append(self, positive, negative, frame_rate): + @classmethod + def execute(cls, positive, negative, frame_rate) -> io.NodeOutput: positive = node_helpers.conditioning_set_values(positive, {"frame_rate": frame_rate}) negative = node_helpers.conditioning_set_values(negative, {"frame_rate": frame_rate}) - return (positive, negative) + return io.NodeOutput(positive, negative) -class ModelSamplingLTXV: +class ModelSamplingLTXV(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "max_shift": ("FLOAT", {"default": 2.05, "min": 0.0, "max": 100.0, "step":0.01}), - "base_shift": ("FLOAT", {"default": 0.95, "min": 0.0, "max": 100.0, "step":0.01}), - }, - "optional": {"latent": ("LATENT",), } - } + def define_schema(cls): + return io.Schema( + node_id="ModelSamplingLTXV", + category="advanced/model", + inputs=[ + io.Model.Input("model"), + io.Float.Input("max_shift", default=2.05, min=0.0, max=100.0, step=0.01), + io.Float.Input("base_shift", default=0.95, min=0.0, max=100.0, step=0.01), + io.Latent.Input("latent", optional=True), + ], + outputs=[ + io.Model.Output(), + ], + ) - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" - - CATEGORY = "advanced/model" - - def patch(self, model, max_shift, base_shift, latent=None): + @classmethod + def execute(cls, model, max_shift, base_shift, latent=None) -> io.NodeOutput: m = model.clone() if latent is None: @@ -329,37 +367,41 @@ class ModelSamplingLTXV: model_sampling.set_parameters(shift=shift) m.add_object_patch("model_sampling", model_sampling) - return (m, ) + return io.NodeOutput(m) -class LTXVScheduler: +class LTXVScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}), - "max_shift": ("FLOAT", {"default": 2.05, "min": 0.0, "max": 100.0, "step":0.01}), - "base_shift": ("FLOAT", {"default": 0.95, "min": 0.0, "max": 100.0, "step":0.01}), - "stretch": ("BOOLEAN", { - "default": True, - "tooltip": "Stretch the sigmas to be in the range [terminal, 1]." - }), - "terminal": ( - "FLOAT", - { - "default": 0.1, "min": 0.0, "max": 0.99, "step": 0.01, - "tooltip": "The terminal value of the sigmas after stretching." - }, - ), - }, - "optional": {"latent": ("LATENT",), } - } + def define_schema(cls): + return io.Schema( + node_id="LTXVScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Int.Input("steps", default=20, min=1, max=10000), + io.Float.Input("max_shift", default=2.05, min=0.0, max=100.0, step=0.01), + io.Float.Input("base_shift", default=0.95, min=0.0, max=100.0, step=0.01), + io.Boolean.Input( + id="stretch", + default=True, + tooltip="Stretch the sigmas to be in the range [terminal, 1].", + ), + io.Float.Input( + id="terminal", + default=0.1, + min=0.0, + max=0.99, + step=0.01, + tooltip="The terminal value of the sigmas after stretching.", + ), + io.Latent.Input("latent", optional=True), + ], + outputs=[ + io.Sigmas.Output(), + ], + ) - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" - - FUNCTION = "get_sigmas" - - def get_sigmas(self, steps, max_shift, base_shift, stretch, terminal, latent=None): + @classmethod + def execute(cls, steps, max_shift, base_shift, stretch, terminal, latent=None) -> io.NodeOutput: if latent is None: tokens = 4096 else: @@ -389,7 +431,7 @@ class LTXVScheduler: stretched = 1.0 - (one_minus_z / scale_factor) sigmas[non_zero_mask] = stretched - return (sigmas,) + return io.NodeOutput(sigmas) def encode_single_frame(output_file, image_array: np.ndarray, crf): container = av.open(output_file, "w", format="mp4") @@ -423,52 +465,54 @@ def preprocess(image: torch.Tensor, crf=29): return image image_array = (image[:(image.shape[0] // 2) * 2, :(image.shape[1] // 2) * 2] * 255.0).byte().cpu().numpy() - with io.BytesIO() as output_file: + with BytesIO() as output_file: encode_single_frame(output_file, image_array, crf) video_bytes = output_file.getvalue() - with io.BytesIO(video_bytes) as video_file: + with BytesIO(video_bytes) as video_file: image_array = decode_single_frame(video_file) tensor = torch.tensor(image_array, dtype=image.dtype, device=image.device) / 255.0 return tensor -class LTXVPreprocess: +class LTXVPreprocess(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - "img_compression": ( - "INT", - { - "default": 35, - "min": 0, - "max": 100, - "tooltip": "Amount of compression to apply on image.", - }, + def define_schema(cls): + return io.Schema( + node_id="LTXVPreprocess", + category="image", + inputs=[ + io.Image.Input("image"), + io.Int.Input( + id="img_compression", default=35, min=0, max=100, tooltip="Amount of compression to apply on image." ), - } - } + ], + outputs=[ + io.Image.Output(display_name="output_image"), + ], + ) - FUNCTION = "preprocess" - RETURN_TYPES = ("IMAGE",) - RETURN_NAMES = ("output_image",) - CATEGORY = "image" - - def preprocess(self, image, img_compression): + @classmethod + def execute(cls, image, img_compression) -> io.NodeOutput: output_images = [] for i in range(image.shape[0]): output_images.append(preprocess(image[i], img_compression)) - return (torch.stack(output_images),) + return io.NodeOutput(torch.stack(output_images)) -NODE_CLASS_MAPPINGS = { - "EmptyLTXVLatentVideo": EmptyLTXVLatentVideo, - "LTXVImgToVideo": LTXVImgToVideo, - "ModelSamplingLTXV": ModelSamplingLTXV, - "LTXVConditioning": LTXVConditioning, - "LTXVScheduler": LTXVScheduler, - "LTXVAddGuide": LTXVAddGuide, - "LTXVPreprocess": LTXVPreprocess, - "LTXVCropGuides": LTXVCropGuides, -} +class LtxvExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + EmptyLTXVLatentVideo, + LTXVImgToVideo, + ModelSamplingLTXV, + LTXVConditioning, + LTXVScheduler, + LTXVAddGuide, + LTXVPreprocess, + LTXVCropGuides, + ] + + +async def comfy_entrypoint() -> LtxvExtension: + return LtxvExtension() From e4f99b479a19730bea890567129f4032b4dd4787 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 1 Oct 2025 22:20:30 +0300 Subject: [PATCH 377/759] convert nodes_ip2p.pt to V3 schema (#10097) --- comfy_extras/nodes_ip2p.py | 54 +++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/comfy_extras/nodes_ip2p.py b/comfy_extras/nodes_ip2p.py index c2e70a84c..78f29915d 100644 --- a/comfy_extras/nodes_ip2p.py +++ b/comfy_extras/nodes_ip2p.py @@ -1,21 +1,30 @@ import torch -class InstructPixToPixConditioning: +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io + + +class InstructPixToPixConditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "pixels": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="InstructPixToPixConditioning", + category="conditioning/instructpix2pix", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Image.Input("pixels"), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING","CONDITIONING","LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/instructpix2pix" - - def encode(self, positive, negative, pixels, vae): + @classmethod + def execute(cls, positive, negative, pixels, vae) -> io.NodeOutput: x = (pixels.shape[1] // 8) * 8 y = (pixels.shape[2] // 8) * 8 @@ -38,8 +47,17 @@ class InstructPixToPixConditioning: n = [t[0], d] c.append(n) out.append(c) - return (out[0], out[1], out_latent) + return io.NodeOutput(out[0], out[1], out_latent) + + +class InstructPix2PixExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + InstructPixToPixConditioning, + ] + + +async def comfy_entrypoint() -> InstructPix2PixExtension: + return InstructPix2PixExtension() -NODE_CLASS_MAPPINGS = { - "InstructPixToPixConditioning": InstructPixToPixConditioning, -} From a6f83a4a1a70d720c16d66feb5d87fee5998acdf Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 1 Oct 2025 14:19:13 -0700 Subject: [PATCH 378/759] Support the new hunyuan vae. (#10150) --- comfy/ldm/hunyuan_video/vae_refiner.py | 112 ++++++++++++++++--------- comfy/sd.py | 70 ++++++++++------ 2 files changed, 116 insertions(+), 66 deletions(-) diff --git a/comfy/ldm/hunyuan_video/vae_refiner.py b/comfy/ldm/hunyuan_video/vae_refiner.py index c6f742710..c2a0b507d 100644 --- a/comfy/ldm/hunyuan_video/vae_refiner.py +++ b/comfy/ldm/hunyuan_video/vae_refiner.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d +from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d, Normalize import comfy.ops import comfy.ldm.models.autoencoder ops = comfy.ops.disable_weight_init @@ -17,11 +17,12 @@ class RMS_norm(nn.Module): return F.normalize(x, dim=1) * self.scale * self.gamma class DnSmpl(nn.Module): - def __init__(self, ic, oc, tds=True): + def __init__(self, ic, oc, tds=True, refiner_vae=True, op=VideoConv3d): super().__init__() fct = 2 * 2 * 2 if tds else 1 * 2 * 2 assert oc % fct == 0 - self.conv = VideoConv3d(ic, oc // fct, kernel_size=3) + self.conv = op(ic, oc // fct, kernel_size=3, stride=1, padding=1) + self.refiner_vae = refiner_vae self.tds = tds self.gs = fct * ic // oc @@ -30,7 +31,7 @@ class DnSmpl(nn.Module): r1 = 2 if self.tds else 1 h = self.conv(x) - if self.tds: + if self.tds and self.refiner_vae: hf = h[:, :, :1, :, :] b, c, f, ht, wd = hf.shape hf = hf.reshape(b, c, f, ht // 2, 2, wd // 2, 2) @@ -66,6 +67,7 @@ class DnSmpl(nn.Module): sc = torch.cat([xf, xn], dim=2) else: b, c, frms, ht, wd = h.shape + nf = frms // r1 h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2) h = h.permute(0, 3, 5, 7, 1, 2, 4, 6) @@ -83,10 +85,11 @@ class DnSmpl(nn.Module): class UpSmpl(nn.Module): - def __init__(self, ic, oc, tus=True): + def __init__(self, ic, oc, tus=True, refiner_vae=True, op=VideoConv3d): super().__init__() fct = 2 * 2 * 2 if tus else 1 * 2 * 2 - self.conv = VideoConv3d(ic, oc * fct, kernel_size=3) + self.conv = op(ic, oc * fct, kernel_size=3, stride=1, padding=1) + self.refiner_vae = refiner_vae self.tus = tus self.rp = fct * oc // ic @@ -95,7 +98,7 @@ class UpSmpl(nn.Module): r1 = 2 if self.tus else 1 h = self.conv(x) - if self.tus: + if self.tus and self.refiner_vae: hf = h[:, :, :1, :, :] b, c, f, ht, wd = hf.shape nc = c // (2 * 2) @@ -148,43 +151,56 @@ class UpSmpl(nn.Module): class Encoder(nn.Module): def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks, - ffactor_spatial, ffactor_temporal, downsample_match_channel=True, **_): + ffactor_spatial, ffactor_temporal, downsample_match_channel=True, refiner_vae=True, **_): super().__init__() self.z_channels = z_channels self.block_out_channels = block_out_channels self.num_res_blocks = num_res_blocks - self.conv_in = VideoConv3d(in_channels, block_out_channels[0], 3, 1, 1) + self.ffactor_temporal = ffactor_temporal + + self.refiner_vae = refiner_vae + if self.refiner_vae: + conv_op = VideoConv3d + norm_op = RMS_norm + else: + conv_op = ops.Conv3d + norm_op = Normalize + + self.conv_in = conv_op(in_channels, block_out_channels[0], 3, 1, 1) self.down = nn.ModuleList() ch = block_out_channels[0] depth = (ffactor_spatial >> 1).bit_length() - depth_temporal = ((ffactor_spatial // ffactor_temporal) >> 1).bit_length() + depth_temporal = ((ffactor_spatial // self.ffactor_temporal) >> 1).bit_length() for i, tgt in enumerate(block_out_channels): stage = nn.Module() stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, out_channels=tgt, temb_channels=0, - conv_op=VideoConv3d, norm_op=RMS_norm) + conv_op=conv_op, norm_op=norm_op) for j in range(num_res_blocks)]) ch = tgt if i < depth: nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and downsample_match_channel else ch - stage.downsample = DnSmpl(ch, nxt, tds=i >= depth_temporal) + stage.downsample = DnSmpl(ch, nxt, tds=i >= depth_temporal, refiner_vae=self.refiner_vae, op=conv_op) ch = nxt self.down.append(stage) self.mid = nn.Module() - self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm) - self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=RMS_norm) - self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm) + self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op) + self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op) + self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op) - self.norm_out = RMS_norm(ch) - self.conv_out = VideoConv3d(ch, z_channels << 1, 3, 1, 1) + self.norm_out = norm_op(ch) + self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1) self.regul = comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer() def forward(self, x): + if not self.refiner_vae and x.shape[2] == 1: + x = x.expand(-1, -1, self.ffactor_temporal, -1, -1) + x = self.conv_in(x) for stage in self.down: @@ -200,31 +216,42 @@ class Encoder(nn.Module): skip = x.view(b, c // grp, grp, t, h, w).mean(2) out = self.conv_out(F.silu(self.norm_out(x))) + skip - out = self.regul(out)[0] - out = torch.cat((out[:, :, :1], out), dim=2) - out = out.permute(0, 2, 1, 3, 4) - b, f_times_2, c, h, w = out.shape - out = out.reshape(b, f_times_2 // 2, 2 * c, h, w) - out = out.permute(0, 2, 1, 3, 4).contiguous() + if self.refiner_vae: + out = self.regul(out)[0] + + out = torch.cat((out[:, :, :1], out), dim=2) + out = out.permute(0, 2, 1, 3, 4) + b, f_times_2, c, h, w = out.shape + out = out.reshape(b, f_times_2 // 2, 2 * c, h, w) + out = out.permute(0, 2, 1, 3, 4).contiguous() + return out class Decoder(nn.Module): def __init__(self, z_channels, out_channels, block_out_channels, num_res_blocks, - ffactor_spatial, ffactor_temporal, upsample_match_channel=True, **_): + ffactor_spatial, ffactor_temporal, upsample_match_channel=True, refiner_vae=True, **_): super().__init__() block_out_channels = block_out_channels[::-1] self.z_channels = z_channels self.block_out_channels = block_out_channels self.num_res_blocks = num_res_blocks + self.refiner_vae = refiner_vae + if self.refiner_vae: + conv_op = VideoConv3d + norm_op = RMS_norm + else: + conv_op = ops.Conv3d + norm_op = Normalize + ch = block_out_channels[0] - self.conv_in = VideoConv3d(z_channels, ch, 3) + self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1) self.mid = nn.Module() - self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm) - self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=RMS_norm) - self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm) + self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op) + self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op) + self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op) self.up = nn.ModuleList() depth = (ffactor_spatial >> 1).bit_length() @@ -235,25 +262,26 @@ class Decoder(nn.Module): stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, out_channels=tgt, temb_channels=0, - conv_op=VideoConv3d, norm_op=RMS_norm) + conv_op=conv_op, norm_op=norm_op) for j in range(num_res_blocks + 1)]) ch = tgt if i < depth: nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and upsample_match_channel else ch - stage.upsample = UpSmpl(ch, nxt, tus=i < depth_temporal) + stage.upsample = UpSmpl(ch, nxt, tus=i < depth_temporal, refiner_vae=self.refiner_vae, op=conv_op) ch = nxt self.up.append(stage) - self.norm_out = RMS_norm(ch) - self.conv_out = VideoConv3d(ch, out_channels, 3) + self.norm_out = norm_op(ch) + self.conv_out = conv_op(ch, out_channels, 3, stride=1, padding=1) def forward(self, z): - z = z.permute(0, 2, 1, 3, 4) - b, f, c, h, w = z.shape - z = z.reshape(b, f, 2, c // 2, h, w) - z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w) - z = z.permute(0, 2, 1, 3, 4) - z = z[:, :, 1:] + if self.refiner_vae: + z = z.permute(0, 2, 1, 3, 4) + b, f, c, h, w = z.shape + z = z.reshape(b, f, 2, c // 2, h, w) + z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w) + z = z.permute(0, 2, 1, 3, 4) + z = z[:, :, 1:] x = self.conv_in(z) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1) x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x))) @@ -264,4 +292,10 @@ class Decoder(nn.Module): if hasattr(stage, 'upsample'): x = stage.upsample(x) - return self.conv_out(F.silu(self.norm_out(x))) + out = self.conv_out(F.silu(self.norm_out(x))) + + if not self.refiner_vae: + if z.shape[-3] == 1: + out = out[:, :, -1:] + + return out diff --git a/comfy/sd.py b/comfy/sd.py index 2df340739..873ad20f2 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -332,35 +332,51 @@ class VAE: self.first_stage_model = StageC_coder() self.downscale_ratio = 32 self.latent_channels = 16 - elif "decoder.conv_in.weight" in sd and sd['decoder.conv_in.weight'].shape[1] == 64: - ddconfig = {"block_out_channels": [128, 256, 512, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 32, "downsample_match_channel": True, "upsample_match_channel": True} - self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] - self.downscale_ratio = 32 - self.upscale_ratio = 32 - self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] - self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"}, - encoder_config={'target': "comfy.ldm.hunyuan_video.vae.Encoder", 'params': ddconfig}, - decoder_config={'target': "comfy.ldm.hunyuan_video.vae.Decoder", 'params': ddconfig}) - - self.memory_used_encode = lambda shape, dtype: (700 * shape[2] * shape[3]) * model_management.dtype_size(dtype) - self.memory_used_decode = lambda shape, dtype: (700 * shape[2] * shape[3] * 32 * 32) * model_management.dtype_size(dtype) - elif "decoder.conv_in.weight" in sd: - #default SD1.x/SD2.x VAE parameters - ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0} - - if 'encoder.down.2.downsample.conv.weight' not in sd and 'decoder.up.3.upsample.conv.weight' not in sd: #Stable diffusion x4 upscaler VAE - ddconfig['ch_mult'] = [1, 2, 4] - self.downscale_ratio = 4 - self.upscale_ratio = 4 - - self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] - if 'post_quant_conv.weight' in sd: - self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1]) - else: + if sd['decoder.conv_in.weight'].shape[1] == 64: + ddconfig = {"block_out_channels": [128, 256, 512, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 32, "downsample_match_channel": True, "upsample_match_channel": True} + self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] + self.downscale_ratio = 32 + self.upscale_ratio = 32 + self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"}, - encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': ddconfig}, - decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': ddconfig}) + encoder_config={'target': "comfy.ldm.hunyuan_video.vae.Encoder", 'params': ddconfig}, + decoder_config={'target': "comfy.ldm.hunyuan_video.vae.Decoder", 'params': ddconfig}) + + self.memory_used_encode = lambda shape, dtype: (700 * shape[2] * shape[3]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (700 * shape[2] * shape[3] * 32 * 32) * model_management.dtype_size(dtype) + elif sd['decoder.conv_in.weight'].shape[1] == 32: + ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True, "refiner_vae": False} + self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] + self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] + self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16) + self.upscale_index_formula = (4, 16, 16) + self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16) + self.downscale_index_formula = (4, 16, 16) + self.latent_dim = 3 + self.not_video = True + self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"}, + encoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Encoder", 'params': ddconfig}, + decoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Decoder", 'params': ddconfig}) + + self.memory_used_encode = lambda shape, dtype: (2800 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (2800 * shape[-3] * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype) + else: + #default SD1.x/SD2.x VAE parameters + ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0} + + if 'encoder.down.2.downsample.conv.weight' not in sd and 'decoder.up.3.upsample.conv.weight' not in sd: #Stable diffusion x4 upscaler VAE + ddconfig['ch_mult'] = [1, 2, 4] + self.downscale_ratio = 4 + self.upscale_ratio = 4 + + self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] + if 'post_quant_conv.weight' in sd: + self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1]) + else: + self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"}, + encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': ddconfig}, + decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': ddconfig}) elif "decoder.layers.1.layers.0.beta" in sd: self.first_stage_model = AudioOobleckVAE() self.memory_used_encode = lambda shape, dtype: (1000 * shape[2]) * model_management.dtype_size(dtype) From bb32d4ec3141333df26fcdaee0c3c08e41b7b249 Mon Sep 17 00:00:00 2001 From: Koratahiu Date: Thu, 2 Oct 2025 00:59:07 +0300 Subject: [PATCH 379/759] feat: Add Epsilon Scaling node for exposure bias correction (#10132) --- comfy_extras/nodes_eps.py | 60 +++++++++++++++++++++++++++++++++++++++ nodes.py | 1 + 2 files changed, 61 insertions(+) create mode 100644 comfy_extras/nodes_eps.py diff --git a/comfy_extras/nodes_eps.py b/comfy_extras/nodes_eps.py new file mode 100644 index 000000000..c8818f096 --- /dev/null +++ b/comfy_extras/nodes_eps.py @@ -0,0 +1,60 @@ +class EpsilonScaling: + """ + Implements the Epsilon Scaling method from 'Elucidating the Exposure Bias in Diffusion Models' + (https://arxiv.org/abs/2308.15321v6). + + This method mitigates exposure bias by scaling the predicted noise during sampling, + which can significantly improve sample quality. This implementation uses the "uniform schedule" + recommended by the paper for its practicality and effectiveness. + """ + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "model": ("MODEL",), + "scaling_factor": ("FLOAT", { + "default": 1.005, + "min": 0.5, + "max": 1.5, + "step": 0.001, + "display": "number" + }), + } + } + + RETURN_TYPES = ("MODEL",) + FUNCTION = "patch" + + CATEGORY = "model_patches/unet" + + def patch(self, model, scaling_factor): + # Prevent division by zero, though the UI's min value should prevent this. + if scaling_factor == 0: + scaling_factor = 1e-9 + + def epsilon_scaling_function(args): + """ + This function is applied after the CFG guidance has been calculated. + It recalculates the denoised latent by scaling the predicted noise. + """ + denoised = args["denoised"] + x = args["input"] + + noise_pred = x - denoised + + scaled_noise_pred = noise_pred / scaling_factor + + new_denoised = x - scaled_noise_pred + + return new_denoised + + # Clone the model patcher to avoid modifying the original model in place + model_clone = model.clone() + + model_clone.set_model_sampler_post_cfg_function(epsilon_scaling_function) + + return (model_clone,) + +NODE_CLASS_MAPPINGS = { + "Epsilon Scaling": EpsilonScaling +} diff --git a/nodes.py b/nodes.py index 1a6784b68..88d712993 100644 --- a/nodes.py +++ b/nodes.py @@ -2297,6 +2297,7 @@ async def init_builtin_extra_nodes(): "nodes_gits.py", "nodes_controlnet.py", "nodes_hunyuan.py", + "nodes_eps.py", "nodes_flux.py", "nodes_lora_extract.py", "nodes_torch_compile.py", From 911331c06c16aa80633c5438c58edb32dbfdff50 Mon Sep 17 00:00:00 2001 From: rattus128 <46076784+rattus128@users.noreply.github.com> Date: Thu, 2 Oct 2025 08:40:28 +1000 Subject: [PATCH 380/759] sd: fix VAE tiled fallback VRAM leak (#10139) When the VAE catches this VRAM OOM, it launches the fallback logic straight from the exception context. Python however refs the entire call stack that caused the exception including any local variables for the sake of exception report and debugging. In the case of tensors, this can hold on the references to GBs of VRAM and inhibit the VRAM allocated from freeing them. So dump the except context completely before going back to the VAE via the tiler by getting out of the except block with nothing but a flag. The greately increases the reliability of the tiler fallback, especially on low VRAM cards, as with the bug, if the leak randomly leaked more than the headroom needed for a single tile, the tiler would fallback would OOM and fail the flow. --- comfy/sd.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/comfy/sd.py b/comfy/sd.py index 873ad20f2..be225ad03 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -652,6 +652,7 @@ class VAE: def decode(self, samples_in, vae_options={}): self.throw_exception_if_invalid() pixel_samples = None + do_tile = False try: memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype) model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) @@ -667,6 +668,13 @@ class VAE: pixel_samples[x:x+batch_number] = out except model_management.OOM_EXCEPTION: logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.") + #NOTE: We don't know what tensors were allocated to stack variables at the time of the + #exception and the exception itself refs them all until we get out of this except block. + #So we just set a flag for tiler fallback so that tensor gc can happen once the + #exception is fully off the books. + do_tile = True + + if do_tile: dims = samples_in.ndim - 2 if dims == 1 or self.extra_1d_channel is not None: pixel_samples = self.decode_tiled_1d(samples_in) @@ -713,6 +721,7 @@ class VAE: self.throw_exception_if_invalid() pixel_samples = self.vae_encode_crop_pixels(pixel_samples) pixel_samples = pixel_samples.movedim(-1, 1) + do_tile = False if self.latent_dim == 3 and pixel_samples.ndim < 5: if not self.not_video: pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) @@ -734,6 +743,13 @@ class VAE: except model_management.OOM_EXCEPTION: logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.") + #NOTE: We don't know what tensors were allocated to stack variables at the time of the + #exception and the exception itself refs them all until we get out of this except block. + #So we just set a flag for tiler fallback so that tensor gc can happen once the + #exception is fully off the books. + do_tile = True + + if do_tile: if self.latent_dim == 3: tile = 256 overlap = tile // 4 From 4965c0e2acf39d84e82cb63dd6cc4400299d0a61 Mon Sep 17 00:00:00 2001 From: rattus128 <46076784+rattus128@users.noreply.github.com> Date: Thu, 2 Oct 2025 08:42:16 +1000 Subject: [PATCH 381/759] WAN: Fix cache VRAM leak on error (#10141) If this suffers an exception (such as a VRAM oom) it will leave the encode() and decode() methods which skips the cleanup of the WAN feature cache. The comfy node cache then ultimately keeps a reference this object which is in turn reffing large tensors from the failed execution. The feature cache is currently setup at a class variable on the encoder/decoder however, the encode and decode functions always clear it on both entry and exit of normal execution. Its likely the design intent is this is usable as a streaming encoder where the input comes in batches, however the functions as they are today don't support that. So simplify by bringing the cache back to local variable, so that if it does VRAM OOM the cache itself is properly garbage when the encode()/decode() functions dissappear from the stack. --- comfy/ldm/wan/vae.py | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/comfy/ldm/wan/vae.py b/comfy/ldm/wan/vae.py index 791596938..ccbb25822 100644 --- a/comfy/ldm/wan/vae.py +++ b/comfy/ldm/wan/vae.py @@ -468,55 +468,46 @@ class WanVAE(nn.Module): attn_scales, self.temperal_upsample, dropout) def encode(self, x): - self.clear_cache() + conv_idx = [0] + feat_map = [None] * count_conv3d(self.decoder) ## cache t = x.shape[2] iter_ = 1 + (t - 1) // 4 ## 对encode输入的x,按时间拆分为1、4、4、4.... for i in range(iter_): - self._enc_conv_idx = [0] + conv_idx = [0] if i == 0: out = self.encoder( x[:, :, :1, :, :], - feat_cache=self._enc_feat_map, - feat_idx=self._enc_conv_idx) + feat_cache=feat_map, + feat_idx=conv_idx) else: out_ = self.encoder( x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :], - feat_cache=self._enc_feat_map, - feat_idx=self._enc_conv_idx) + feat_cache=feat_map, + feat_idx=conv_idx) out = torch.cat([out, out_], 2) mu, log_var = self.conv1(out).chunk(2, dim=1) - self.clear_cache() return mu def decode(self, z): - self.clear_cache() + conv_idx = [0] + feat_map = [None] * count_conv3d(self.decoder) # z: [b,c,t,h,w] iter_ = z.shape[2] x = self.conv2(z) for i in range(iter_): - self._conv_idx = [0] + conv_idx = [0] if i == 0: out = self.decoder( x[:, :, i:i + 1, :, :], - feat_cache=self._feat_map, - feat_idx=self._conv_idx) + feat_cache=feat_map, + feat_idx=conv_idx) else: out_ = self.decoder( x[:, :, i:i + 1, :, :], - feat_cache=self._feat_map, - feat_idx=self._conv_idx) + feat_cache=feat_map, + feat_idx=conv_idx) out = torch.cat([out, out_], 2) - self.clear_cache() return out - - def clear_cache(self): - self._conv_num = count_conv3d(self.decoder) - self._conv_idx = [0] - self._feat_map = [None] * self._conv_num - #cache encode - self._enc_conv_num = count_conv3d(self.encoder) - self._enc_conv_idx = [0] - self._enc_feat_map = [None] * self._enc_conv_num From 0e9d1724be327c79ba86159d868f0b57adb8c384 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 1 Oct 2025 21:33:05 -0700 Subject: [PATCH 382/759] Add a .bat to the AMD portable to disable smart memory. (#10153) --- .ci/windows_amd_base_files/README_VERY_IMPORTANT.txt | 5 ++++- .../run_amd_gpu_disable_smart_memory.bat | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100755 .ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat diff --git a/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt b/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt index 570ac3398..96a500be2 100755 --- a/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt +++ b/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt @@ -3,10 +3,13 @@ https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOW HOW TO RUN: -if you have a AMD gpu: +If you have a AMD gpu: run_amd_gpu.bat +If you have memory issues you can try disabling the smart memory management by running comfyui with: + +run_amd_gpu_disable_smart_memory.bat IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints diff --git a/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat b/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat new file mode 100755 index 000000000..cece0aeb2 --- /dev/null +++ b/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat @@ -0,0 +1,2 @@ +.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --disable-smart-memory +pause From 8f4ee9984c0c3864290e4fea81cfea2ba281717d Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 2 Oct 2025 23:53:00 +0300 Subject: [PATCH 383/759] convert nodes_morphology.py to V3 schema (#10159) --- comfy_extras/nodes_morphology.py | 116 +++++++++++++++++++------------ 1 file changed, 70 insertions(+), 46 deletions(-) diff --git a/comfy_extras/nodes_morphology.py b/comfy_extras/nodes_morphology.py index 075b26c40..67377e1bc 100644 --- a/comfy_extras/nodes_morphology.py +++ b/comfy_extras/nodes_morphology.py @@ -1,24 +1,34 @@ import torch import comfy.model_management +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io from kornia.morphology import dilation, erosion, opening, closing, gradient, top_hat, bottom_hat import kornia.color -class Morphology: +class Morphology(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"image": ("IMAGE",), - "operation": (["erode", "dilate", "open", "close", "gradient", "bottom_hat", "top_hat"],), - "kernel_size": ("INT", {"default": 3, "min": 3, "max": 999, "step": 1}), - }} + def define_schema(cls): + return io.Schema( + node_id="Morphology", + display_name="ImageMorphology", + category="image/postprocessing", + inputs=[ + io.Image.Input("image"), + io.Combo.Input( + "operation", + options=["erode", "dilate", "open", "close", "gradient", "bottom_hat", "top_hat"], + ), + io.Int.Input("kernel_size", default=3, min=3, max=999, step=1), + ], + outputs=[ + io.Image.Output(), + ], + ) - RETURN_TYPES = ("IMAGE",) - FUNCTION = "process" - - CATEGORY = "image/postprocessing" - - def process(self, image, operation, kernel_size): + @classmethod + def execute(cls, image, operation, kernel_size) -> io.NodeOutput: device = comfy.model_management.get_torch_device() kernel = torch.ones(kernel_size, kernel_size, device=device) image_k = image.to(device).movedim(-1, 1) @@ -39,49 +49,63 @@ class Morphology: else: raise ValueError(f"Invalid operation {operation} for morphology. Must be one of 'erode', 'dilate', 'open', 'close', 'gradient', 'tophat', 'bottomhat'") img_out = output.to(comfy.model_management.intermediate_device()).movedim(1, -1) - return (img_out,) + return io.NodeOutput(img_out) -class ImageRGBToYUV: +class ImageRGBToYUV(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "image": ("IMAGE",), - }} + def define_schema(cls): + return io.Schema( + node_id="ImageRGBToYUV", + category="image/batch", + inputs=[ + io.Image.Input("image"), + ], + outputs=[ + io.Image.Output(display_name="Y"), + io.Image.Output(display_name="U"), + io.Image.Output(display_name="V"), + ], + ) - RETURN_TYPES = ("IMAGE", "IMAGE", "IMAGE") - RETURN_NAMES = ("Y", "U", "V") - FUNCTION = "execute" - - CATEGORY = "image/batch" - - def execute(self, image): + @classmethod + def execute(cls, image) -> io.NodeOutput: out = kornia.color.rgb_to_ycbcr(image.movedim(-1, 1)).movedim(1, -1) - return (out[..., 0:1].expand_as(image), out[..., 1:2].expand_as(image), out[..., 2:3].expand_as(image)) + return io.NodeOutput(out[..., 0:1].expand_as(image), out[..., 1:2].expand_as(image), out[..., 2:3].expand_as(image)) -class ImageYUVToRGB: +class ImageYUVToRGB(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"Y": ("IMAGE",), - "U": ("IMAGE",), - "V": ("IMAGE",), - }} + def define_schema(cls): + return io.Schema( + node_id="ImageYUVToRGB", + category="image/batch", + inputs=[ + io.Image.Input("Y"), + io.Image.Input("U"), + io.Image.Input("V"), + ], + outputs=[ + io.Image.Output(), + ], + ) - RETURN_TYPES = ("IMAGE",) - FUNCTION = "execute" - - CATEGORY = "image/batch" - - def execute(self, Y, U, V): + @classmethod + def execute(cls, Y, U, V) -> io.NodeOutput: image = torch.cat([torch.mean(Y, dim=-1, keepdim=True), torch.mean(U, dim=-1, keepdim=True), torch.mean(V, dim=-1, keepdim=True)], dim=-1) out = kornia.color.ycbcr_to_rgb(image.movedim(-1, 1)).movedim(1, -1) - return (out,) + return io.NodeOutput(out) -NODE_CLASS_MAPPINGS = { - "Morphology": Morphology, - "ImageRGBToYUV": ImageRGBToYUV, - "ImageYUVToRGB": ImageYUVToRGB, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "Morphology": "ImageMorphology", -} +class MorphologyExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + Morphology, + ImageRGBToYUV, + ImageYUVToRGB, + ] + + +async def comfy_entrypoint() -> MorphologyExtension: + return MorphologyExtension() + From f6e3e9a456127a7e539929f42ea6cac838197879 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 3 Oct 2025 00:50:31 +0300 Subject: [PATCH 384/759] fix(api-nodes): made logging path to be smaller (#10156) --- comfy_api_nodes/apis/client.py | 5 +- comfy_api_nodes/apis/request_logger.py | 72 ++++++++++++++++++++------ 2 files changed, 59 insertions(+), 18 deletions(-) diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py index 0aed906fb..18a694675 100644 --- a/comfy_api_nodes/apis/client.py +++ b/comfy_api_nodes/apis/client.py @@ -95,6 +95,7 @@ import aiohttp import asyncio import logging import io +import os import socket from aiohttp.client_exceptions import ClientError, ClientResponseError from typing import Dict, Type, Optional, Any, TypeVar, Generic, Callable, Tuple @@ -499,7 +500,9 @@ class ApiClient: else: raise ValueError("File must be BytesIO or str path") - operation_id = f"upload_{upload_url.split('/')[-1]}_{uuid.uuid4().hex[:8]}" + parsed = urlparse(upload_url) + basename = os.path.basename(parsed.path) or parsed.netloc or "upload" + operation_id = f"upload_{basename}_{uuid.uuid4().hex[:8]}" request_logger.log_request_response( operation_id=operation_id, request_method="PUT", diff --git a/comfy_api_nodes/apis/request_logger.py b/comfy_api_nodes/apis/request_logger.py index 42901e141..2e0ca5380 100644 --- a/comfy_api_nodes/apis/request_logger.py +++ b/comfy_api_nodes/apis/request_logger.py @@ -4,16 +4,18 @@ import os import datetime import json import logging +import re +import hashlib +from typing import Any + import folder_paths # Get the logger instance logger = logging.getLogger(__name__) + def get_log_directory(): - """ - Ensures the API log directory exists within ComfyUI's temp directory - and returns its path. - """ + """Ensures the API log directory exists within ComfyUI's temp directory and returns its path.""" base_temp_dir = folder_paths.get_temp_directory() log_dir = os.path.join(base_temp_dir, "api_logs") try: @@ -24,42 +26,77 @@ def get_log_directory(): return base_temp_dir return log_dir -def _format_data_for_logging(data): + +def _sanitize_filename_component(name: str) -> str: + if not name: + return "log" + sanitized = re.sub(r"[^A-Za-z0-9._-]+", "_", name) # Replace disallowed characters with underscore + sanitized = sanitized.strip(" ._") # Windows: trailing dots or spaces are not allowed + if not sanitized: + sanitized = "log" + return sanitized + + +def _short_hash(*parts: str, length: int = 10) -> str: + return hashlib.sha1(("|".join(parts)).encode("utf-8")).hexdigest()[:length] + + +def _build_log_filepath(log_dir: str, operation_id: str, request_url: str) -> str: + """Build log filepath. We keep it well under common path length limits aiming for <= 240 characters total.""" + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f") + slug = _sanitize_filename_component(operation_id) # Best-effort human-readable slug from operation_id + h = _short_hash(operation_id or "", request_url or "") # Short hash ties log to the full operation and URL + + # Compute how much room we have for the slug given the directory length + # Keep total path length reasonably below ~260 on Windows. + max_total_path = 240 + prefix = f"{timestamp}_" + suffix = f"_{h}.log" + if not slug: + slug = "op" + max_filename_len = max(60, max_total_path - len(log_dir) - 1) + max_slug_len = max(8, max_filename_len - len(prefix) - len(suffix)) + if len(slug) > max_slug_len: + slug = slug[:max_slug_len].rstrip(" ._-") + return os.path.join(log_dir, f"{prefix}{slug}{suffix}") + + +def _format_data_for_logging(data: Any) -> str: """Helper to format data (dict, str, bytes) for logging.""" if isinstance(data, bytes): try: - return data.decode('utf-8') # Try to decode as text + return data.decode("utf-8") # Try to decode as text except UnicodeDecodeError: return f"[Binary data of length {len(data)} bytes]" elif isinstance(data, (dict, list)): try: return json.dumps(data, indent=2, ensure_ascii=False) except TypeError: - return str(data) # Fallback for non-serializable objects + return str(data) # Fallback for non-serializable objects return str(data) + def log_request_response( operation_id: str, request_method: str, request_url: str, request_headers: dict | None = None, request_params: dict | None = None, - request_data: any = None, + request_data: Any = None, response_status_code: int | None = None, response_headers: dict | None = None, - response_content: any = None, - error_message: str | None = None + response_content: Any = None, + error_message: str | None = None, ): """ Logs API request and response details to a file in the temp/api_logs directory. + Filenames are sanitized and length-limited for cross-platform safety. + If we still fail to write, we fall back to appending into api.log. """ log_dir = get_log_directory() - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f") - filename = f"{timestamp}_{operation_id.replace('/', '_').replace(':', '_')}.log" - filepath = os.path.join(log_dir, filename) - - log_content = [] + filepath = _build_log_filepath(log_dir, operation_id, request_url) + log_content: list[str] = [] log_content.append(f"Timestamp: {datetime.datetime.now().isoformat()}") log_content.append(f"Operation ID: {operation_id}") log_content.append("-" * 30 + " REQUEST " + "-" * 30) @@ -69,7 +106,7 @@ def log_request_response( log_content.append(f"Headers:\n{_format_data_for_logging(request_headers)}") if request_params: log_content.append(f"Params:\n{_format_data_for_logging(request_params)}") - if request_data: + if request_data is not None: log_content.append(f"Data/Body:\n{_format_data_for_logging(request_data)}") log_content.append("\n" + "-" * 30 + " RESPONSE " + "-" * 30) @@ -77,7 +114,7 @@ def log_request_response( log_content.append(f"Status Code: {response_status_code}") if response_headers: log_content.append(f"Headers:\n{_format_data_for_logging(response_headers)}") - if response_content: + if response_content is not None: log_content.append(f"Content:\n{_format_data_for_logging(response_content)}") if error_message: log_content.append(f"Error:\n{error_message}") @@ -89,6 +126,7 @@ def log_request_response( except Exception as e: logger.error(f"Error writing API log to {filepath}: {e}") + if __name__ == '__main__': # Example usage (for testing the logger directly) logger.setLevel(logging.DEBUG) From e9364ee279f65d0546fea1796c3cd2e0b7e1965f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 2 Oct 2025 14:57:15 -0700 Subject: [PATCH 385/759] Turn on TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL by default. (#10168) --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index 70696fcc3..35857dba8 100644 --- a/main.py +++ b/main.py @@ -115,6 +115,7 @@ if os.name == "nt": os.environ['MIMALLOC_PURGE_DELAY'] = '0' if __name__ == "__main__": + os.environ['TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL'] = '1' if args.default_device is not None: default_dev = args.default_device devices = list(range(32)) From 1395bce9f707e52ec613eeaa87ea690518cfe0a8 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 3 Oct 2025 01:20:29 +0300 Subject: [PATCH 386/759] update example_node to use V3 schema (#9723) --- custom_nodes/example_node.py.example | 161 +++++++++++---------------- 1 file changed, 68 insertions(+), 93 deletions(-) diff --git a/custom_nodes/example_node.py.example b/custom_nodes/example_node.py.example index 29ab2aa72..779c35787 100644 --- a/custom_nodes/example_node.py.example +++ b/custom_nodes/example_node.py.example @@ -1,96 +1,70 @@ -class Example: +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io + + +class Example(io.ComfyNode): """ - A example node + An example node Class methods ------------- - INPUT_TYPES (dict): - Tell the main program input parameters of nodes. - IS_CHANGED: + define_schema (io.Schema): + Tell the main program the metadata, input, output parameters of nodes. + fingerprint_inputs: optional method to control when the node is re executed. + check_lazy_status: + optional method to control list of input names that need to be evaluated. - Attributes - ---------- - RETURN_TYPES (`tuple`): - The type of each element in the output tuple. - RETURN_NAMES (`tuple`): - Optional: The name of each output in the output tuple. - FUNCTION (`str`): - The name of the entry-point method. For example, if `FUNCTION = "execute"` then it will run Example().execute() - OUTPUT_NODE ([`bool`]): - If this node is an output node that outputs a result/image from the graph. The SaveImage node is an example. - The backend iterates on these output nodes and tries to execute all their parents if their parent graph is properly connected. - Assumed to be False if not present. - CATEGORY (`str`): - The category the node should appear in the UI. - DEPRECATED (`bool`): - Indicates whether the node is deprecated. Deprecated nodes are hidden by default in the UI, but remain - functional in existing workflows that use them. - EXPERIMENTAL (`bool`): - Indicates whether the node is experimental. Experimental nodes are marked as such in the UI and may be subject to - significant changes or removal in future versions. Use with caution in production workflows. - execute(s) -> tuple || None: - The entry point method. The name of this method must be the same as the value of property `FUNCTION`. - For example, if `FUNCTION = "execute"` then this method's name must be `execute`, if `FUNCTION = "foo"` then it must be `foo`. """ - def __init__(self): - pass @classmethod - def INPUT_TYPES(s): + def define_schema(cls) -> io.Schema: """ - Return a dictionary which contains config for all input fields. - Some types (string): "MODEL", "VAE", "CLIP", "CONDITIONING", "LATENT", "IMAGE", "INT", "STRING", "FLOAT". - Input types "INT", "STRING" or "FLOAT" are special values for fields on the node. - The type can be a list for selection. - - Returns: `dict`: - - Key input_fields_group (`string`): Can be either required, hidden or optional. A node class must have property `required` - - Value input_fields (`dict`): Contains input fields config: - * Key field_name (`string`): Name of a entry-point method's argument - * Value field_config (`tuple`): - + First value is a string indicate the type of field or a list for selection. - + Second value is a config for type "INT", "STRING" or "FLOAT". + Return a schema which contains all information about the node. + Some types: "Model", "Vae", "Clip", "Conditioning", "Latent", "Image", "Int", "String", "Float", "Combo". + For outputs the "io.Model.Output" should be used, for inputs the "io.Model.Input" can be used. + The type can be a "Combo" - this will be a list for selection. """ - return { - "required": { - "image": ("IMAGE",), - "int_field": ("INT", { - "default": 0, - "min": 0, #Minimum value - "max": 4096, #Maximum value - "step": 64, #Slider's step - "display": "number", # Cosmetic only: display as "number" or "slider" - "lazy": True # Will only be evaluated if check_lazy_status requires it - }), - "float_field": ("FLOAT", { - "default": 1.0, - "min": 0.0, - "max": 10.0, - "step": 0.01, - "round": 0.001, #The value representing the precision to round to, will be set to the step value by default. Can be set to False to disable rounding. - "display": "number", - "lazy": True - }), - "print_to_screen": (["enable", "disable"],), - "string_field": ("STRING", { - "multiline": False, #True if you want the field to look like the one on the ClipTextEncode node - "default": "Hello World!", - "lazy": True - }), - }, - } + return io.Schema( + node_id="Example", + display_name="Example Node", + category="Example", + inputs=[ + io.Image.Input("image"), + io.Int.Input( + "int_field", + min=0, + max=4096, + step=64, # Slider's step + display_mode=io.NumberDisplay.number, # Cosmetic only: display as "number" or "slider" + lazy=True, # Will only be evaluated if check_lazy_status requires it + ), + io.Float.Input( + "float_field", + default=1.0, + min=0.0, + max=10.0, + step=0.01, + round=0.001, #The value representing the precision to round to, will be set to the step value by default. Can be set to False to disable rounding. + display_mode=io.NumberDisplay.number, + lazy=True, + ), + io.Combo.Input("print_to_screen", options=["enable", "disable"]), + io.String.Input( + "string_field", + multiline=False, # True if you want the field to look like the one on the ClipTextEncode node + default="Hello world!", + lazy=True, + ) + ], + outputs=[ + io.Image.Output(), + ], + ) - RETURN_TYPES = ("IMAGE",) - #RETURN_NAMES = ("image_output_name",) - - FUNCTION = "test" - - #OUTPUT_NODE = False - - CATEGORY = "Example" - - def check_lazy_status(self, image, string_field, int_field, float_field, print_to_screen): + @classmethod + def check_lazy_status(cls, image, string_field, int_field, float_field, print_to_screen): """ Return a list of input names that need to be evaluated. @@ -107,7 +81,8 @@ class Example: else: return [] - def test(self, image, string_field, int_field, float_field, print_to_screen): + @classmethod + def execute(cls, image, string_field, int_field, float_field, print_to_screen) -> io.NodeOutput: if print_to_screen == "enable": print(f"""Your input contains: string_field aka input text: {string_field} @@ -116,7 +91,7 @@ class Example: """) #do some processing on the image, in this example I just invert it image = 1.0 - image - return (image,) + return io.NodeOutput(image) """ The node will always be re executed if any of the inputs change but @@ -127,7 +102,7 @@ class Example: changes between executions the LoadImage node is executed again. """ #@classmethod - #def IS_CHANGED(s, image, string_field, int_field, float_field, print_to_screen): + #def fingerprint_inputs(s, image, string_field, int_field, float_field, print_to_screen): # return "" # Set the web directory, any .js file in that directory will be loaded by the frontend as a frontend extension @@ -143,13 +118,13 @@ async def get_hello(request): return web.json_response("hello") -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "Example": Example -} +class ExampleExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + Example, + ] -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "Example": "Example Node" -} + +async def comfy_entrypoint() -> ExampleExtension: # ComfyUI calls this to load your extension and its nodes. + return ExampleExtension() From 4ffea0e864275301329ddb5ecc3fbc7211d7a802 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 3 Oct 2025 02:14:28 +0300 Subject: [PATCH 387/759] feat(linter, api-nodes): add pylint for comfy_api_nodes folder (#10157) --- .github/workflows/ruff.yml | 25 ++++++++++++++ comfy_api_nodes/apis/__init__.py | 1 + comfy_api_nodes/apis/client.py | 2 +- comfy_api_nodes/apis/rodin_api.py | 4 --- pyproject.toml | 54 +++++++++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 4c1a02594..b24d86a6b 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -21,3 +21,28 @@ jobs: - name: Run Ruff run: ruff check . + + pylint: + name: Run Pylint + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install requirements + run: | + python -m pip install --upgrade pip + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install -r requirements.txt + + - name: Install Pylint + run: pip install pylint + + - name: Run Pylint + run: pylint comfy_api_nodes diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 78a23db30..98f9e540d 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -2,6 +2,7 @@ # filename: filtered-openapi.yaml # timestamp: 2025-07-30T08:54:00+00:00 +# pylint: disable from __future__ import annotations from datetime import date, datetime diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py index 18a694675..79de3c262 100644 --- a/comfy_api_nodes/apis/client.py +++ b/comfy_api_nodes/apis/client.py @@ -535,7 +535,7 @@ class ApiClient: request_method="PUT", request_url=upload_url, response_status_code=e.status if hasattr(e, "status") else None, - response_headers=dict(e.headers) if getattr(e, "headers") else None, + response_headers=dict(e.headers) if hasattr(e, "headers") else None, response_content=None, error_message=f"{type(e).__name__}: {str(e)}", ) diff --git a/comfy_api_nodes/apis/rodin_api.py b/comfy_api_nodes/apis/rodin_api.py index 02cf42c29..fc26a6e73 100644 --- a/comfy_api_nodes/apis/rodin_api.py +++ b/comfy_api_nodes/apis/rodin_api.py @@ -52,7 +52,3 @@ class RodinResourceItem(BaseModel): class Rodin3DDownloadResponse(BaseModel): list: List[RodinResourceItem] = Field(..., description="Source List") - - - - diff --git a/pyproject.toml b/pyproject.toml index d0a76c6d0..598af4157 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,3 +22,57 @@ lint.select = [ "F", ] exclude = ["*.ipynb", "**/generated/*.pyi"] + +[tool.pylint] +master.py-version = "3.9" +master.extension-pkg-allow-list = [ + "pydantic", +] +reports.output-format = "colorized" +similarities.ignore-imports = "yes" +messages_control.disable = [ + "missing-module-docstring", + "missing-class-docstring", + "missing-function-docstring", + "line-too-long", + "too-few-public-methods", + "too-many-public-methods", + "too-many-instance-attributes", + "too-many-positional-arguments", + "broad-exception-raised", + "too-many-lines", + "invalid-name", + "unused-argument", + "broad-exception-caught", + "consider-using-with", + "fixme", + "too-many-statements", + "too-many-branches", + "too-many-locals", + "too-many-arguments", + "duplicate-code", + "abstract-method", + "superfluous-parens", + "arguments-differ", + "redefined-builtin", + "unnecessary-lambda", + "dangerous-default-value", + # next warnings should be fixed in future + "bad-classmethod-argument", # Class method should have 'cls' as first argument + "wrong-import-order", # Standard imports should be placed before third party imports + "logging-fstring-interpolation", # Use lazy % formatting in logging functions + "ungrouped-imports", + "unnecessary-pass", + "unidiomatic-typecheck", + "unnecessary-lambda-assignment", + "bad-indentation", + "no-else-return", + "no-else-raise", + "invalid-overridden-method", + "unused-variable", + "pointless-string-statement", + "inconsistent-return-statements", + "import-outside-toplevel", + "reimported", + "redefined-outer-name", +] From ed3ca78e080d697b6cf29497c07e14ee9c27a3ac Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 3 Oct 2025 21:26:34 +0300 Subject: [PATCH 388/759] feat(api-nodes): add kling-2-5-turbo to txt2video and img2video nodes (#10155) --- comfy_api_nodes/apis/__init__.py | 2 ++ comfy_api_nodes/nodes_kling.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 98f9e540d..ee2aa1ce6 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -1321,6 +1321,7 @@ class KlingTextToVideoModelName(str, Enum): kling_v1 = 'kling-v1' kling_v1_6 = 'kling-v1-6' kling_v2_1_master = 'kling-v2-1-master' + kling_v2_5_turbo = 'kling-v2-5-turbo' class KlingVideoGenAspectRatio(str, Enum): @@ -1355,6 +1356,7 @@ class KlingVideoGenModelName(str, Enum): kling_v2_master = 'kling-v2-master' kling_v2_1 = 'kling-v2-1' kling_v2_1_master = 'kling-v2-1-master' + kling_v2_5_turbo = 'kling-v2-5-turbo' class KlingVideoResult(BaseModel): diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 5f55b2cc9..d8646f106 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -423,6 +423,8 @@ class KlingTextToVideoNode(KlingNodeBase): "standard mode / 10s duration / kling-v2-master": ("std", "10", "kling-v2-master"), "pro mode / 5s duration / kling-v2-1-master": ("pro", "5", "kling-v2-1-master"), "pro mode / 10s duration / kling-v2-1-master": ("pro", "10", "kling-v2-1-master"), + "pro mode / 5s duration / kling-v2-5-turbo": ("pro", "5", "kling-v2-5-turbo"), + "pro mode / 10s duration / kling-v2-5-turbo": ("pro", "10", "kling-v2-5-turbo"), } @classmethod From 8a293372ecdea0ff8647921eaf3bb10c3d992abf Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 3 Oct 2025 21:40:27 +0300 Subject: [PATCH 389/759] fix(api-nodes): reimport of base64 in Gemini node (#10181) --- comfy_api_nodes/nodes_gemini.py | 1 - pyproject.toml | 1 - 2 files changed, 2 deletions(-) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index baa379b75..151cb4044 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -490,7 +490,6 @@ class GeminiInputFiles(ComfyNodeABC): # Use base64 string directly, not the data URI with open(file_path, "rb") as f: file_content = f.read() - import base64 base64_str = base64.b64encode(file_content).decode("utf-8") return GeminiPart( diff --git a/pyproject.toml b/pyproject.toml index 598af4157..7952f7f37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,5 @@ messages_control.disable = [ "pointless-string-statement", "inconsistent-return-statements", "import-outside-toplevel", - "reimported", "redefined-outer-name", ] From c2c5a7d5f80579bb44c11de0ce6eff94d1c111b9 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 3 Oct 2025 21:41:06 +0300 Subject: [PATCH 390/759] fix(api-nodes): bad indentation in Recraft API node function (#10175) --- comfy_api_nodes/nodes_recraft.py | 78 ++++++++++++++++---------------- pyproject.toml | 1 - 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index c8516b368..a006104b7 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -38,48 +38,48 @@ from PIL import UnidentifiedImageError async def handle_recraft_file_request( - image: torch.Tensor, - path: str, - mask: torch.Tensor=None, - total_pixels=4096*4096, - timeout=1024, - request=None, - auth_kwargs: dict[str,str] = None, - ) -> list[BytesIO]: - """ - Handle sending common Recraft file-only request to get back file bytes. - """ - if request is None: - request = EmptyRequest() + image: torch.Tensor, + path: str, + mask: torch.Tensor=None, + total_pixels=4096*4096, + timeout=1024, + request=None, + auth_kwargs: dict[str,str] = None, +) -> list[BytesIO]: + """ + Handle sending common Recraft file-only request to get back file bytes. + """ + if request is None: + request = EmptyRequest() - files = { - 'image': tensor_to_bytesio(image, total_pixels=total_pixels).read() - } - if mask is not None: - files['mask'] = tensor_to_bytesio(mask, total_pixels=total_pixels).read() + files = { + 'image': tensor_to_bytesio(image, total_pixels=total_pixels).read() + } + if mask is not None: + files['mask'] = tensor_to_bytesio(mask, total_pixels=total_pixels).read() - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=type(request), - response_model=RecraftImageGenerationResponse, - ), - request=request, - files=files, - content_type="multipart/form-data", - auth_kwargs=auth_kwargs, - multipart_parser=recraft_multipart_parser, - ) - response: RecraftImageGenerationResponse = await operation.execute() - all_bytesio = [] - if response.image is not None: - all_bytesio.append(await download_url_to_bytesio(response.image.url, timeout=timeout)) - else: - for data in response.data: - all_bytesio.append(await download_url_to_bytesio(data.url, timeout=timeout)) + operation = SynchronousOperation( + endpoint=ApiEndpoint( + path=path, + method=HttpMethod.POST, + request_model=type(request), + response_model=RecraftImageGenerationResponse, + ), + request=request, + files=files, + content_type="multipart/form-data", + auth_kwargs=auth_kwargs, + multipart_parser=recraft_multipart_parser, + ) + response: RecraftImageGenerationResponse = await operation.execute() + all_bytesio = [] + if response.image is not None: + all_bytesio.append(await download_url_to_bytesio(response.image.url, timeout=timeout)) + else: + for data in response.data: + all_bytesio.append(await download_url_to_bytesio(data.url, timeout=timeout)) - return all_bytesio + return all_bytesio def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, converted_to_check: list[list]=None, is_list=False) -> dict: diff --git a/pyproject.toml b/pyproject.toml index 7952f7f37..240919a43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,6 @@ messages_control.disable = [ "unnecessary-pass", "unidiomatic-typecheck", "unnecessary-lambda-assignment", - "bad-indentation", "no-else-return", "no-else-raise", "invalid-overridden-method", From 3e68bc342cd60b909b4117c1b68a3afc62ef875c Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 3 Oct 2025 21:43:54 +0300 Subject: [PATCH 391/759] convert nodes_torch_compile.py to V3 schema (#10173) --- comfy_extras/nodes_torch_compile.py | 46 +++++++++++++++++++---------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/comfy_extras/nodes_torch_compile.py b/comfy_extras/nodes_torch_compile.py index 605536678..adbeece2f 100644 --- a/comfy_extras/nodes_torch_compile.py +++ b/comfy_extras/nodes_torch_compile.py @@ -1,23 +1,39 @@ +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io from comfy_api.torch_helpers import set_torch_compile_wrapper -class TorchCompileModel: +class TorchCompileModel(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "backend": (["inductor", "cudagraphs"],), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="TorchCompileModel", + category="_for_testing", + inputs=[ + io.Model.Input("model"), + io.Combo.Input( + "backend", + options=["inductor", "cudagraphs"], + ), + ], + outputs=[io.Model.Output()], + is_experimental=True, + ) - CATEGORY = "_for_testing" - EXPERIMENTAL = True - - def patch(self, model, backend): + @classmethod + def execute(cls, model, backend) -> io.NodeOutput: m = model.clone() set_torch_compile_wrapper(model=m, backend=backend) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "TorchCompileModel": TorchCompileModel, -} + +class TorchCompileExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TorchCompileModel, + ] + + +async def comfy_entrypoint() -> TorchCompileExtension: + return TorchCompileExtension() From d7aa414141f02a456801704a3da323fa2ed8f5cc Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 3 Oct 2025 21:45:02 +0300 Subject: [PATCH 392/759] convert nodes_eps.py to V3 schema (#10172) --- comfy_extras/nodes_eps.py | 62 ++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/comfy_extras/nodes_eps.py b/comfy_extras/nodes_eps.py index c8818f096..7852d85e5 100644 --- a/comfy_extras/nodes_eps.py +++ b/comfy_extras/nodes_eps.py @@ -1,4 +1,9 @@ -class EpsilonScaling: +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io + + +class EpsilonScaling(io.ComfyNode): """ Implements the Epsilon Scaling method from 'Elucidating the Exposure Bias in Diffusion Models' (https://arxiv.org/abs/2308.15321v6). @@ -8,26 +13,28 @@ class EpsilonScaling: recommended by the paper for its practicality and effectiveness. """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": ("MODEL",), - "scaling_factor": ("FLOAT", { - "default": 1.005, - "min": 0.5, - "max": 1.5, - "step": 0.001, - "display": "number" - }), - } - } + def define_schema(cls): + return io.Schema( + node_id="Epsilon Scaling", + category="model_patches/unet", + inputs=[ + io.Model.Input("model"), + io.Float.Input( + "scaling_factor", + default=1.005, + min=0.5, + max=1.5, + step=0.001, + display_mode=io.NumberDisplay.number, + ), + ], + outputs=[ + io.Model.Output(), + ], + ) - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" - - CATEGORY = "model_patches/unet" - - def patch(self, model, scaling_factor): + @classmethod + def execute(cls, model, scaling_factor) -> io.NodeOutput: # Prevent division by zero, though the UI's min value should prevent this. if scaling_factor == 0: scaling_factor = 1e-9 @@ -53,8 +60,15 @@ class EpsilonScaling: model_clone.set_model_sampler_post_cfg_function(epsilon_scaling_function) - return (model_clone,) + return io.NodeOutput(model_clone) -NODE_CLASS_MAPPINGS = { - "Epsilon Scaling": EpsilonScaling -} + +class EpsilonScalingExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + EpsilonScaling, + ] + +async def comfy_entrypoint() -> EpsilonScalingExtension: + return EpsilonScalingExtension() From 8c26d7bbe6663f589f0a9562921aafb3c48955c6 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 3 Oct 2025 21:48:21 +0300 Subject: [PATCH 393/759] convert nodes_pixverse.py to V3 schema (#10177) --- comfy_api_nodes/nodes_pixverse.py | 471 +++++++++++++++--------------- 1 file changed, 238 insertions(+), 233 deletions(-) diff --git a/comfy_api_nodes/nodes_pixverse.py b/comfy_api_nodes/nodes_pixverse.py index 7c5a52feb..eb98e9653 100644 --- a/comfy_api_nodes/nodes_pixverse.py +++ b/comfy_api_nodes/nodes_pixverse.py @@ -1,5 +1,7 @@ from inspect import cleandoc from typing import Optional +from typing_extensions import override +from io import BytesIO from comfy_api_nodes.apis.pixverse_api import ( PixverseTextVideoRequest, PixverseImageVideoRequest, @@ -26,12 +28,11 @@ from comfy_api_nodes.apinode_utils import ( tensor_to_bytesio, validate_string, ) -from comfy.comfy_types.node_typing import IO, ComfyNodeABC from comfy_api.input_impl import VideoFromFile +from comfy_api.latest import ComfyExtension, io as comfy_io import torch import aiohttp -from io import BytesIO AVERAGE_DURATION_T2V = 32 @@ -72,100 +73,101 @@ async def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None): return response_upload.Resp.img_id -class PixverseTemplateNode: +class PixverseTemplateNode(comfy_io.ComfyNode): """ Select template for PixVerse Video generation. """ - RETURN_TYPES = (PixverseIO.TEMPLATE,) - RETURN_NAMES = ("pixverse_template",) - FUNCTION = "create_template" - CATEGORY = "api node/video/PixVerse" + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="PixverseTemplateNode", + display_name="PixVerse Template", + category="api node/video/PixVerse", + inputs=[ + comfy_io.Combo.Input("template", options=[list(pixverse_templates.keys())]), + ], + outputs=[comfy_io.Custom(PixverseIO.TEMPLATE).Output(display_name="pixverse_template")], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "template": (list(pixverse_templates.keys()),), - } - } - - def create_template(self, template: str): + def execute(cls, template: str) -> comfy_io.NodeOutput: template_id = pixverse_templates.get(template, None) if template_id is None: raise Exception(f"Template '{template}' is not recognized.") # just return the integer - return (template_id,) + return comfy_io.NodeOutput(template_id) -class PixverseTextToVideoNode(ComfyNodeABC): +class PixverseTextToVideoNode(comfy_io.ComfyNode): """ Generates videos based on prompt and output_size. """ - RETURN_TYPES = (IO.VIDEO,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/video/PixVerse" + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="PixverseTextToVideoNode", + display_name="PixVerse Text to Video", + category="api node/video/PixVerse", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the video generation", + ), + comfy_io.Combo.Input( + "aspect_ratio", + options=[ratio.value for ratio in PixverseAspectRatio], + ), + comfy_io.Combo.Input( + "quality", + options=[resolution.value for resolution in PixverseQuality], + default=PixverseQuality.res_540p, + ), + comfy_io.Combo.Input( + "duration_seconds", + options=[dur.value for dur in PixverseDuration], + ), + comfy_io.Combo.Input( + "motion_mode", + options=[mode.value for mode in PixverseMotionMode], + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + control_after_generate=True, + tooltip="Seed for video generation.", + ), + comfy_io.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + comfy_io.Custom(PixverseIO.TEMPLATE).Input( + "pixverse_template", + tooltip="An optional template to influence style of generation, created by the PixVerse Template node.", + optional=True, + ), + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the video generation", - }, - ), - "aspect_ratio": ([ratio.value for ratio in PixverseAspectRatio],), - "quality": ( - [resolution.value for resolution in PixverseQuality], - { - "default": PixverseQuality.res_540p, - }, - ), - "duration_seconds": ([dur.value for dur in PixverseDuration],), - "motion_mode": ([mode.value for mode in PixverseMotionMode],), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2147483647, - "control_after_generate": True, - "tooltip": "Seed for video generation.", - }, - ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - "pixverse_template": ( - PixverseIO.TEMPLATE, - { - "tooltip": "An optional template to influence style of generation, created by the PixVerse Template node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, aspect_ratio: str, quality: str, @@ -174,9 +176,7 @@ class PixverseTextToVideoNode(ComfyNodeABC): seed, negative_prompt: str = None, pixverse_template: int = None, - unique_id: Optional[str] = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=False) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration @@ -186,6 +186,10 @@ class PixverseTextToVideoNode(ComfyNodeABC): elif duration_seconds != PixverseDuration.dur_5: motion_mode = PixverseMotionMode.normal + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/pixverse/video/text/generate", @@ -203,7 +207,7 @@ class PixverseTextToVideoNode(ComfyNodeABC): template_id=pixverse_template, seed=seed, ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -224,8 +228,8 @@ class PixverseTextToVideoNode(ComfyNodeABC): PixverseStatus.deleted, ], status_extractor=lambda x: x.Resp.status, - auth_kwargs=kwargs, - node_id=unique_id, + auth_kwargs=auth, + node_id=cls.hidden.unique_id, result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_T2V, ) @@ -233,77 +237,75 @@ class PixverseTextToVideoNode(ComfyNodeABC): async with aiohttp.ClientSession() as session: async with session.get(response_poll.Resp.url) as vid_response: - return (VideoFromFile(BytesIO(await vid_response.content.read())),) + return comfy_io.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) -class PixverseImageToVideoNode(ComfyNodeABC): +class PixverseImageToVideoNode(comfy_io.ComfyNode): """ Generates videos based on prompt and output_size. """ - RETURN_TYPES = (IO.VIDEO,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/video/PixVerse" + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="PixverseImageToVideoNode", + display_name="PixVerse Image to Video", + category="api node/video/PixVerse", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("image"), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the video generation", + ), + comfy_io.Combo.Input( + "quality", + options=[resolution.value for resolution in PixverseQuality], + default=PixverseQuality.res_540p, + ), + comfy_io.Combo.Input( + "duration_seconds", + options=[dur.value for dur in PixverseDuration], + ), + comfy_io.Combo.Input( + "motion_mode", + options=[mode.value for mode in PixverseMotionMode], + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + control_after_generate=True, + tooltip="Seed for video generation.", + ), + comfy_io.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + comfy_io.Custom(PixverseIO.TEMPLATE).Input( + "pixverse_template", + tooltip="An optional template to influence style of generation, created by the PixVerse Template node.", + optional=True, + ), + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the video generation", - }, - ), - "quality": ( - [resolution.value for resolution in PixverseQuality], - { - "default": PixverseQuality.res_540p, - }, - ), - "duration_seconds": ([dur.value for dur in PixverseDuration],), - "motion_mode": ([mode.value for mode in PixverseMotionMode],), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2147483647, - "control_after_generate": True, - "tooltip": "Seed for video generation.", - }, - ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - "pixverse_template": ( - PixverseIO.TEMPLATE, - { - "tooltip": "An optional template to influence style of generation, created by the PixVerse Template node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, image: torch.Tensor, prompt: str, quality: str, @@ -312,11 +314,13 @@ class PixverseImageToVideoNode(ComfyNodeABC): seed, negative_prompt: str = None, pixverse_template: int = None, - unique_id: Optional[str] = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=False) - img_id = await upload_image_to_pixverse(image, auth_kwargs=kwargs) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + img_id = await upload_image_to_pixverse(image, auth_kwargs=auth) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration @@ -343,7 +347,7 @@ class PixverseImageToVideoNode(ComfyNodeABC): template_id=pixverse_template, seed=seed, ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -364,8 +368,8 @@ class PixverseImageToVideoNode(ComfyNodeABC): PixverseStatus.deleted, ], status_extractor=lambda x: x.Resp.status, - auth_kwargs=kwargs, - node_id=unique_id, + auth_kwargs=auth, + node_id=cls.hidden.unique_id, result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_I2V, ) @@ -373,72 +377,71 @@ class PixverseImageToVideoNode(ComfyNodeABC): async with aiohttp.ClientSession() as session: async with session.get(response_poll.Resp.url) as vid_response: - return (VideoFromFile(BytesIO(await vid_response.content.read())),) + return comfy_io.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) -class PixverseTransitionVideoNode(ComfyNodeABC): +class PixverseTransitionVideoNode(comfy_io.ComfyNode): """ Generates videos based on prompt and output_size. """ - RETURN_TYPES = (IO.VIDEO,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/video/PixVerse" + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="PixverseTransitionVideoNode", + display_name="PixVerse Transition Video", + category="api node/video/PixVerse", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("first_frame"), + comfy_io.Image.Input("last_frame"), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the video generation", + ), + comfy_io.Combo.Input( + "quality", + options=[resolution.value for resolution in PixverseQuality], + default=PixverseQuality.res_540p, + ), + comfy_io.Combo.Input( + "duration_seconds", + options=[dur.value for dur in PixverseDuration], + ), + comfy_io.Combo.Input( + "motion_mode", + options=[mode.value for mode in PixverseMotionMode], + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + control_after_generate=True, + tooltip="Seed for video generation.", + ), + comfy_io.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "first_frame": (IO.IMAGE,), - "last_frame": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the video generation", - }, - ), - "quality": ( - [resolution.value for resolution in PixverseQuality], - { - "default": PixverseQuality.res_540p, - }, - ), - "duration_seconds": ([dur.value for dur in PixverseDuration],), - "motion_mode": ([mode.value for mode in PixverseMotionMode],), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2147483647, - "control_after_generate": True, - "tooltip": "Seed for video generation.", - }, - ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, first_frame: torch.Tensor, last_frame: torch.Tensor, prompt: str, @@ -447,12 +450,14 @@ class PixverseTransitionVideoNode(ComfyNodeABC): motion_mode: str, seed, negative_prompt: str = None, - unique_id: Optional[str] = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: validate_string(prompt, strip_whitespace=False) - first_frame_id = await upload_image_to_pixverse(first_frame, auth_kwargs=kwargs) - last_frame_id = await upload_image_to_pixverse(last_frame, auth_kwargs=kwargs) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + first_frame_id = await upload_image_to_pixverse(first_frame, auth_kwargs=auth) + last_frame_id = await upload_image_to_pixverse(last_frame, auth_kwargs=auth) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration @@ -479,7 +484,7 @@ class PixverseTransitionVideoNode(ComfyNodeABC): negative_prompt=negative_prompt if negative_prompt else None, seed=seed, ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) response_api = await operation.execute() @@ -500,8 +505,8 @@ class PixverseTransitionVideoNode(ComfyNodeABC): PixverseStatus.deleted, ], status_extractor=lambda x: x.Resp.status, - auth_kwargs=kwargs, - node_id=unique_id, + auth_kwargs=auth, + node_id=cls.hidden.unique_id, result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_T2V, ) @@ -509,19 +514,19 @@ class PixverseTransitionVideoNode(ComfyNodeABC): async with aiohttp.ClientSession() as session: async with session.get(response_poll.Resp.url) as vid_response: - return (VideoFromFile(BytesIO(await vid_response.content.read())),) + return comfy_io.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) -NODE_CLASS_MAPPINGS = { - "PixverseTextToVideoNode": PixverseTextToVideoNode, - "PixverseImageToVideoNode": PixverseImageToVideoNode, - "PixverseTransitionVideoNode": PixverseTransitionVideoNode, - "PixverseTemplateNode": PixverseTemplateNode, -} +class PixVerseExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + PixverseTextToVideoNode, + PixverseImageToVideoNode, + PixverseTransitionVideoNode, + PixverseTemplateNode, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "PixverseTextToVideoNode": "PixVerse Text to Video", - "PixverseImageToVideoNode": "PixVerse Image to Video", - "PixverseTransitionVideoNode": "PixVerse Transition Video", - "PixverseTemplateNode": "PixVerse Template", -} + +async def comfy_entrypoint() -> PixVerseExtension: + return PixVerseExtension() From 5c8e986e273d8af8b976fddbaed726e8278cf1fe Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 3 Oct 2025 21:50:38 +0300 Subject: [PATCH 394/759] convert nodes_tomesd.py to V3 schema (#10180) --- comfy_extras/nodes_tomesd.py | 50 +++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/comfy_extras/nodes_tomesd.py b/comfy_extras/nodes_tomesd.py index 9f77c06fc..87bf29b8f 100644 --- a/comfy_extras/nodes_tomesd.py +++ b/comfy_extras/nodes_tomesd.py @@ -1,7 +1,9 @@ #Taken from: https://github.com/dbolya/tomesd import torch -from typing import Tuple, Callable +from typing import Tuple, Callable, Optional +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io import math def do_nothing(x: torch.Tensor, mode:str=None): @@ -144,33 +146,45 @@ def get_functions(x, ratio, original_shape): -class TomePatchModel: +class TomePatchModel(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "ratio": ("FLOAT", {"default": 0.3, "min": 0.0, "max": 1.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="TomePatchModel", + category="model_patches/unet", + inputs=[ + io.Model.Input("model"), + io.Float.Input("ratio", default=0.3, min=0.0, max=1.0, step=0.01), + ], + outputs=[io.Model.Output()], + ) - CATEGORY = "model_patches/unet" - - def patch(self, model, ratio): - self.u = None + @classmethod + def execute(cls, model, ratio) -> io.NodeOutput: + u: Optional[Callable] = None def tomesd_m(q, k, v, extra_options): + nonlocal u #NOTE: In the reference code get_functions takes x (input of the transformer block) as the argument instead of q #however from my basic testing it seems that using q instead gives better results - m, self.u = get_functions(q, ratio, extra_options["original_shape"]) + m, u = get_functions(q, ratio, extra_options["original_shape"]) return m(q), k, v def tomesd_u(n, extra_options): - return self.u(n) + nonlocal u + return u(n) m = model.clone() m.set_model_attn1_patch(tomesd_m) m.set_model_attn1_output_patch(tomesd_u) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "TomePatchModel": TomePatchModel, -} +class TomePatchModelExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TomePatchModel, + ] + + +async def comfy_entrypoint() -> TomePatchModelExtension: + return TomePatchModelExtension() From 4614ee09ca1aaca7ee8067d6c5c30695582326ff Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 3 Oct 2025 23:24:42 +0300 Subject: [PATCH 395/759] convert nodes_edit_model.py to V3 schema (#10147) --- comfy_extras/nodes_edit_model.py | 46 ++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/comfy_extras/nodes_edit_model.py b/comfy_extras/nodes_edit_model.py index b69f79715..36da66f34 100644 --- a/comfy_extras/nodes_edit_model.py +++ b/comfy_extras/nodes_edit_model.py @@ -1,26 +1,38 @@ import node_helpers +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class ReferenceLatent: +class ReferenceLatent(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"conditioning": ("CONDITIONING", ), - }, - "optional": {"latent": ("LATENT", ),} - } + def define_schema(cls): + return io.Schema( + node_id="ReferenceLatent", + category="advanced/conditioning/edit_models", + description="This node sets the guiding latent for an edit model. If the model supports it you can chain multiple to set multiple reference images.", + inputs=[ + io.Conditioning.Input("conditioning"), + io.Latent.Input("latent", optional=True), + ], + outputs=[ + io.Conditioning.Output(), + ] + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "append" - - CATEGORY = "advanced/conditioning/edit_models" - DESCRIPTION = "This node sets the guiding latent for an edit model. If the model supports it you can chain multiple to set multiple reference images." - - def append(self, conditioning, latent=None): + @classmethod + def execute(cls, conditioning, latent=None) -> io.NodeOutput: if latent is not None: conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": [latent["samples"]]}, append=True) - return (conditioning, ) + return io.NodeOutput(conditioning) -NODE_CLASS_MAPPINGS = { - "ReferenceLatent": ReferenceLatent, -} +class EditModelExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + ReferenceLatent, + ] + + +def comfy_entrypoint() -> EditModelExtension: + return EditModelExtension() From 93d859cfaaad150c2a1e5e54c8f14765fa79ecb5 Mon Sep 17 00:00:00 2001 From: Finn-Hecker Date: Fri, 3 Oct 2025 23:32:19 +0200 Subject: [PATCH 396/759] Fix type annotation syntax in MotionEncoder_tc __init__ (#10186) ## Summary Fixed incorrect type hint syntax in `MotionEncoder_tc.__init__()` parameter list. ## Changes - Line 647: Changed `num_heads=int` to `num_heads: int` - This corrects the parameter annotation from a default value assignment to proper type hint syntax ## Details The parameter was using assignment syntax (`=`) instead of type annotation syntax (`:`), which would incorrectly set the default value to the `int` class itself rather than annotating the expected type. --- comfy/ldm/wan/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 0dc650ced..90c347d3d 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -903,7 +903,7 @@ class MotionEncoder_tc(nn.Module): def __init__(self, in_dim: int, hidden_dim: int, - num_heads=int, + num_heads: int, need_global=True, dtype=None, device=None, From 08726b64fe767f47bf074a05bedd6db45314c4c9 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:22:43 -0700 Subject: [PATCH 397/759] Update amd nightly command in readme. (#10189) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8f24a33ee..1224a6176 100644 --- a/README.md +++ b/README.md @@ -211,9 +211,9 @@ AMD users can install rocm and pytorch with pip if you don't have it already ins ```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4``` -This is the command to install the nightly with ROCm 6.4 which might have some performance improvements: +This is the command to install the nightly with ROCm 7.0 which might have some performance improvements: -```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4``` +```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.0``` ### Intel GPUs (Windows and Linux) From bbd683098e7d18700f025b2f0a4f6a44a3176602 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 3 Oct 2025 20:37:43 -0700 Subject: [PATCH 398/759] Add instructions to install nightly AMD pytorch for windows. (#10190) * Add instructions to install nightly AMD pytorch for windows. * Update README.md --- README.md | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 1224a6176..4a5a17cda 100644 --- a/README.md +++ b/README.md @@ -206,7 +206,8 @@ Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints Put your VAE in: models/vae -### AMD GPUs (Linux only) +### AMD GPUs (Linux) + AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version: ```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4``` @@ -215,6 +216,23 @@ This is the command to install the nightly with ROCm 7.0 which might have some p ```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.0``` + +### AMD GPUs (Experimental: Windows and Linux), RDNA 3, 3.5 and 4 only. + +These have less hardware support than the builds above but they work on windows. You also need to install the pytorch version specific to your hardware. + +RDNA 3 (RX 7000 series): + +```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx110X-dgpu/``` + +RDNA 3.5 (Strix halo/Ryzen AI Max+ 365): + +```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx1151/``` + +RDNA 4 (RX 9000 series): + +```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx120X-all/``` + ### Intel GPUs (Windows and Linux) (Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html) @@ -270,12 +288,6 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve > **Note**: Remember to add your models, VAE, LoRAs etc. to the corresponding Comfy folders, as discussed in [ComfyUI manual installation](#manual-install-windows-linux). -#### DirectML (AMD Cards on Windows) - -This is very badly supported and is not recommended. There are some unofficial builds of pytorch ROCm on windows that exist that will give you a much better experience than this. This readme will be updated once official pytorch ROCm builds for windows come out. - -```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml``` - #### Ascend NPUs For models compatible with Ascend Extension for PyTorch (torch_npu). To get started, ensure your environment meets the prerequisites outlined on the [installation](https://ascend.github.io/docs/sources/ascend/quick_install.html) page. Here's a step-by-step guide tailored to your platform and installation method: From 22f99fb97edaccf450152c8bf7c4068c1d331899 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 4 Oct 2025 22:22:57 +0300 Subject: [PATCH 399/759] fix(api-nodes): enable 2 more pylint rules, removed non needed code (#10192) --- comfy_api_nodes/nodes_gemini.py | 3 +- comfy_api_nodes/nodes_moonvalley.py | 49 ++--------------------------- pyproject.toml | 2 -- 3 files changed, 4 insertions(+), 50 deletions(-) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 151cb4044..309e9a2d2 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -39,6 +39,7 @@ from comfy_api_nodes.apinode_utils import ( tensor_to_base64_string, bytesio_to_image_tensor, ) +from comfy_api.util import VideoContainer, VideoCodec GEMINI_BASE_ENDPOINT = "/proxy/vertexai/gemini" @@ -310,7 +311,7 @@ class GeminiNode(ComfyNodeABC): Returns: List of GeminiPart objects containing the encoded video. """ - from comfy_api.util import VideoContainer, VideoCodec + base_64_string = video_to_base64_string( video_input, container_format=VideoContainer.MP4, diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 08e838fef..6467dd614 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -2,11 +2,7 @@ import logging from typing import Any, Callable, Optional, TypeVar import torch from typing_extensions import override -from comfy_api_nodes.util.validation_utils import ( - get_image_dimensions, - validate_image_dimensions, -) - +from comfy_api_nodes.util.validation_utils import validate_image_dimensions from comfy_api_nodes.apis import ( MoonvalleyTextToVideoRequest, @@ -132,47 +128,6 @@ def validate_prompts( return True -def validate_input_media(width, height, with_frame_conditioning, num_frames_in=None): - # inference validation - # T = num_frames - # in all cases, the following must be true: T divisible by 16 and H,W by 8. in addition... - # with image conditioning: H*W must be divisible by 8192 - # without image conditioning: T divisible by 32 - if num_frames_in and not num_frames_in % 16 == 0: - return False, ("The input video total frame count must be divisible by 16!") - - if height % 8 != 0 or width % 8 != 0: - return False, ( - f"Height ({height}) and width ({width}) must be " "divisible by 8" - ) - - if with_frame_conditioning: - if (height * width) % 8192 != 0: - return False, ( - f"Height * width ({height * width}) must be " - "divisible by 8192 for frame conditioning" - ) - else: - if num_frames_in and not num_frames_in % 32 == 0: - return False, ("The input video total frame count must be divisible by 32!") - - -def validate_input_image( - image: torch.Tensor, with_frame_conditioning: bool = False -) -> None: - """ - Validates the input image adheres to the expectations of the API: - - The image resolution should not be less than 300*300px - - The aspect ratio of the image should be between 1:2.5 ~ 2.5:1 - - """ - height, width = get_image_dimensions(image) - validate_input_media(width, height, with_frame_conditioning) - validate_image_dimensions( - image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH - ) - - def validate_video_to_video_input(video: VideoInput) -> VideoInput: """ Validates and processes video input for Moonvalley Video-to-Video generation. @@ -499,7 +454,7 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): seed: int, steps: int, ) -> comfy_io.NodeOutput: - validate_input_image(image, True) + validate_image_dimensions(image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH) validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = parse_width_height_from_res(resolution) diff --git a/pyproject.toml b/pyproject.toml index 240919a43..383e7d10a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,5 @@ messages_control.disable = [ "invalid-overridden-method", "unused-variable", "pointless-string-statement", - "inconsistent-return-statements", - "import-outside-toplevel", "redefined-outer-name", ] From 2ed74f7ac78d3ff713d0a8583695c31055914b76 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 4 Oct 2025 22:29:09 +0300 Subject: [PATCH 400/759] convert nodes_rodin.py to V3 schema (#10195) --- comfy_api_nodes/nodes_rodin.py | 941 +++++++++++++++++---------------- 1 file changed, 478 insertions(+), 463 deletions(-) diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index 633ac46d3..bd758f762 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -7,14 +7,15 @@ Rodin API docs: https://developer.hyper3d.ai/ from __future__ import annotations from inspect import cleandoc -from comfy.comfy_types.node_typing import IO import folder_paths as comfy_paths import aiohttp import os import asyncio -import io import logging import math +from typing import Optional +from io import BytesIO +from typing_extensions import override from PIL import Image from comfy_api_nodes.apis.rodin_api import ( Rodin3DGenerateRequest, @@ -31,428 +32,436 @@ from comfy_api_nodes.apis.client import ( SynchronousOperation, PollingOperation, ) +from comfy_api.latest import ComfyExtension, io as comfy_io -COMMON_PARAMETERS = { - "Seed": ( - IO.INT, - { - "default":0, - "min":0, - "max":65535, - "display":"number" - } +COMMON_PARAMETERS = [ + comfy_io.Int.Input( + "Seed", + default=0, + min=0, + max=65535, + display_mode=comfy_io.NumberDisplay.number, + optional=True, ), - "Material_Type": ( - IO.COMBO, - { - "options": ["PBR", "Shaded"], - "default": "PBR" - } + comfy_io.Combo.Input("Material_Type", options=["PBR", "Shaded"], default="PBR", optional=True), + comfy_io.Combo.Input( + "Polygon_count", + options=["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "200K-Triangle"], + default="18K-Quad", + optional=True, ), - "Polygon_count": ( - IO.COMBO, - { - "options": ["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "200K-Triangle"], - "default": "18K-Quad" - } +] + + +def get_quality_mode(poly_count): + polycount = poly_count.split("-") + poly = polycount[1] + count = polycount[0] + if poly == "Triangle": + mesh_mode = "Raw" + elif poly == "Quad": + mesh_mode = "Quad" + else: + mesh_mode = "Quad" + + if count == "4K": + quality_override = 4000 + elif count == "8K": + quality_override = 8000 + elif count == "18K": + quality_override = 18000 + elif count == "50K": + quality_override = 50000 + elif count == "2K": + quality_override = 2000 + elif count == "20K": + quality_override = 20000 + elif count == "150K": + quality_override = 150000 + elif count == "500K": + quality_override = 500000 + else: + quality_override = 18000 + + return mesh_mode, quality_override + + +def tensor_to_filelike(tensor, max_pixels: int = 2048*2048): + """ + Converts a PyTorch tensor to a file-like object. + + Args: + - tensor (torch.Tensor): A tensor representing an image of shape (H, W, C) + where C is the number of channels (3 for RGB), H is height, and W is width. + + Returns: + - io.BytesIO: A file-like object containing the image data. + """ + array = tensor.cpu().numpy() + array = (array * 255).astype('uint8') + image = Image.fromarray(array, 'RGB') + + original_width, original_height = image.size + original_pixels = original_width * original_height + if original_pixels > max_pixels: + scale = math.sqrt(max_pixels / original_pixels) + new_width = int(original_width * scale) + new_height = int(original_height * scale) + else: + new_width, new_height = original_width, original_height + + if new_width != original_width or new_height != original_height: + image = image.resize((new_width, new_height), Image.Resampling.LANCZOS) + + img_byte_arr = BytesIO() + image.save(img_byte_arr, format='PNG') # PNG is used for lossless compression + img_byte_arr.seek(0) + return img_byte_arr + + +async def create_generate_task( + images=None, + seed=1, + material="PBR", + quality_override=18000, + tier="Regular", + mesh_mode="Quad", + TAPose = False, + auth_kwargs: Optional[dict[str, str]] = None, +): + if images is None: + raise Exception("Rodin 3D generate requires at least 1 image.") + if len(images) > 5: + raise Exception("Rodin 3D generate requires up to 5 image.") + + path = "/proxy/rodin/api/v2/rodin" + operation = SynchronousOperation( + endpoint=ApiEndpoint( + path=path, + method=HttpMethod.POST, + request_model=Rodin3DGenerateRequest, + response_model=Rodin3DGenerateResponse, + ), + request=Rodin3DGenerateRequest( + seed=seed, + tier=tier, + material=material, + quality_override=quality_override, + mesh_mode=mesh_mode, + TAPose=TAPose, + ), + files=[ + ( + "images", + open(image, "rb") if isinstance(image, str) else tensor_to_filelike(image) + ) + for image in images if image is not None + ], + content_type="multipart/form-data", + auth_kwargs=auth_kwargs, ) -} -def create_task_error(response: Rodin3DGenerateResponse): - """Check if the response has error""" - return hasattr(response, "error") + response = await operation.execute() + + if hasattr(response, "error"): + error_message = f"Rodin3D Create 3D generate Task Failed. Message: {response.message}, error: {response.error}" + logging.error(error_message) + raise Exception(error_message) + + logging.info("[ Rodin3D API - Submit Jobs ] Submit Generate Task Success!") + subscription_key = response.jobs.subscription_key + task_uuid = response.uuid + logging.info(f"[ Rodin3D API - Submit Jobs ] UUID: {task_uuid}") + return task_uuid, subscription_key -class Rodin3DAPI: - """ - Generate 3D Assets using Rodin API - """ - RETURN_TYPES = (IO.STRING,) - RETURN_NAMES = ("3D Model Path",) - CATEGORY = "api node/3d/Rodin" - DESCRIPTION = cleandoc(__doc__ or "") - FUNCTION = "api_call" - API_NODE = True - - def tensor_to_filelike(self, tensor, max_pixels: int = 2048*2048): - """ - Converts a PyTorch tensor to a file-like object. - - Args: - - tensor (torch.Tensor): A tensor representing an image of shape (H, W, C) - where C is the number of channels (3 for RGB), H is height, and W is width. - - Returns: - - io.BytesIO: A file-like object containing the image data. - """ - array = tensor.cpu().numpy() - array = (array * 255).astype('uint8') - image = Image.fromarray(array, 'RGB') - - original_width, original_height = image.size - original_pixels = original_width * original_height - if original_pixels > max_pixels: - scale = math.sqrt(max_pixels / original_pixels) - new_width = int(original_width * scale) - new_height = int(original_height * scale) - else: - new_width, new_height = original_width, original_height - - if new_width != original_width or new_height != original_height: - image = image.resize((new_width, new_height), Image.Resampling.LANCZOS) - - img_byte_arr = io.BytesIO() - image.save(img_byte_arr, format='PNG') # PNG is used for lossless compression - img_byte_arr.seek(0) - return img_byte_arr - - def check_rodin_status(self, response: Rodin3DCheckStatusResponse) -> str: - has_failed = any(job.status == JobStatus.Failed for job in response.jobs) - all_done = all(job.status == JobStatus.Done for job in response.jobs) - status_list = [str(job.status) for job in response.jobs] - logging.info(f"[ Rodin3D API - CheckStatus ] Generate Status: {status_list}") - if has_failed: - logging.error(f"[ Rodin3D API - CheckStatus ] Generate Failed: {status_list}, Please try again.") - raise Exception("[ Rodin3D API ] Generate Failed, Please Try again.") - elif all_done: - return "DONE" - else: - return "Generating" - - async def create_generate_task(self, images=None, seed=1, material="PBR", quality_override=18000, tier="Regular", mesh_mode="Quad", TAPose = False, **kwargs): - if images is None: - raise Exception("Rodin 3D generate requires at least 1 image.") - if len(images) > 5: - raise Exception("Rodin 3D generate requires up to 5 image.") - - path = "/proxy/rodin/api/v2/rodin" - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=Rodin3DGenerateRequest, - response_model=Rodin3DGenerateResponse, - ), - request=Rodin3DGenerateRequest( - seed=seed, - tier=tier, - material=material, - quality_override=quality_override, - mesh_mode=mesh_mode, - TAPose=TAPose, - ), - files=[ - ( - "images", - open(image, "rb") if isinstance(image, str) else self.tensor_to_filelike(image) - ) - for image in images if image is not None - ], - content_type = "multipart/form-data", - auth_kwargs=kwargs, - ) - - response = await operation.execute() - - if create_task_error(response): - error_message = f"Rodin3D Create 3D generate Task Failed. Message: {response.message}, error: {response.error}" - logging.error(error_message) - raise Exception(error_message) - - logging.info("[ Rodin3D API - Submit Jobs ] Submit Generate Task Success!") - subscription_key = response.jobs.subscription_key - task_uuid = response.uuid - logging.info(f"[ Rodin3D API - Submit Jobs ] UUID: {task_uuid}") - return task_uuid, subscription_key - - async def poll_for_task_status(self, subscription_key, **kwargs) -> Rodin3DCheckStatusResponse: - - path = "/proxy/rodin/api/v2/status" - - poll_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path = path, - method=HttpMethod.POST, - request_model=Rodin3DCheckStatusRequest, - response_model=Rodin3DCheckStatusResponse, - ), - request=Rodin3DCheckStatusRequest( - subscription_key = subscription_key - ), - completed_statuses=["DONE"], - failed_statuses=["FAILED"], - status_extractor=self.check_rodin_status, - poll_interval=3.0, - auth_kwargs=kwargs, - ) - - logging.info("[ Rodin3D API - CheckStatus ] Generate Start!") - - return await poll_operation.execute() - - async def get_rodin_download_list(self, uuid, **kwargs) -> Rodin3DDownloadResponse: - logging.info("[ Rodin3D API - Downloading ] Generate Successfully!") - - path = "/proxy/rodin/api/v2/download" - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=Rodin3DDownloadRequest, - response_model=Rodin3DDownloadResponse, - ), - request=Rodin3DDownloadRequest( - task_uuid=uuid - ), - auth_kwargs=kwargs - ) - - return await operation.execute() - - def get_quality_mode(self, poly_count): - polycount = poly_count.split("-") - poly = polycount[1] - count = polycount[0] - if poly == "Triangle": - mesh_mode = "Raw" - elif poly == "Quad": - mesh_mode = "Quad" - else: - mesh_mode = "Quad" - - if count == "4K": - quality_override = 4000 - elif count == "8K": - quality_override = 8000 - elif count == "18K": - quality_override = 18000 - elif count == "50K": - quality_override = 50000 - elif count == "2K": - quality_override = 2000 - elif count == "20K": - quality_override = 20000 - elif count == "150K": - quality_override = 150000 - elif count == "500K": - quality_override = 500000 - else: - quality_override = 18000 - - return mesh_mode, quality_override - - async def download_files(self, url_list, task_uuid): - save_path = os.path.join(comfy_paths.get_output_directory(), f"Rodin3D_{task_uuid}") - os.makedirs(save_path, exist_ok=True) - model_file_path = None - async with aiohttp.ClientSession() as session: - for i in url_list.list: - url = i.url - file_name = i.name - file_path = os.path.join(save_path, file_name) - if file_path.endswith(".glb"): - model_file_path = file_path - logging.info(f"[ Rodin3D API - download_files ] Downloading file: {file_path}") - max_retries = 5 - for attempt in range(max_retries): - try: - async with session.get(url) as resp: - resp.raise_for_status() - with open(file_path, "wb") as f: - async for chunk in resp.content.iter_chunked(32 * 1024): - f.write(chunk) - break - except Exception as e: - logging.info(f"[ Rodin3D API - download_files ] Error downloading {file_path}:{e}") - if attempt < max_retries - 1: - logging.info("Retrying...") - await asyncio.sleep(2) - else: - logging.info( - "[ Rodin3D API - download_files ] Failed to download %s after %s attempts.", - file_path, - max_retries, - ) - - return model_file_path +def check_rodin_status(response: Rodin3DCheckStatusResponse) -> str: + all_done = all(job.status == JobStatus.Done for job in response.jobs) + status_list = [str(job.status) for job in response.jobs] + logging.info(f"[ Rodin3D API - CheckStatus ] Generate Status: {status_list}") + if any(job.status == JobStatus.Failed for job in response.jobs): + logging.error(f"[ Rodin3D API - CheckStatus ] Generate Failed: {status_list}, Please try again.") + raise Exception("[ Rodin3D API ] Generate Failed, Please Try again.") + if all_done: + return "DONE" + return "Generating" -class Rodin3D_Regular(Rodin3DAPI): +async def poll_for_task_status( + subscription_key, auth_kwargs: Optional[dict[str, str]] = None, +) -> Rodin3DCheckStatusResponse: + poll_operation = PollingOperation( + poll_endpoint=ApiEndpoint( + path="/proxy/rodin/api/v2/status", + method=HttpMethod.POST, + request_model=Rodin3DCheckStatusRequest, + response_model=Rodin3DCheckStatusResponse, + ), + request=Rodin3DCheckStatusRequest(subscription_key=subscription_key), + completed_statuses=["DONE"], + failed_statuses=["FAILED"], + status_extractor=check_rodin_status, + poll_interval=3.0, + auth_kwargs=auth_kwargs, + ) + logging.info("[ Rodin3D API - CheckStatus ] Generate Start!") + return await poll_operation.execute() + + +async def get_rodin_download_list(uuid, auth_kwargs: Optional[dict[str, str]] = None) -> Rodin3DDownloadResponse: + logging.info("[ Rodin3D API - Downloading ] Generate Successfully!") + operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/rodin/api/v2/download", + method=HttpMethod.POST, + request_model=Rodin3DDownloadRequest, + response_model=Rodin3DDownloadResponse, + ), + request=Rodin3DDownloadRequest(task_uuid=uuid), + auth_kwargs=auth_kwargs, + ) + return await operation.execute() + + +async def download_files(url_list, task_uuid): + save_path = os.path.join(comfy_paths.get_output_directory(), f"Rodin3D_{task_uuid}") + os.makedirs(save_path, exist_ok=True) + model_file_path = None + async with aiohttp.ClientSession() as session: + for i in url_list.list: + url = i.url + file_name = i.name + file_path = os.path.join(save_path, file_name) + if file_path.endswith(".glb"): + model_file_path = file_path + logging.info(f"[ Rodin3D API - download_files ] Downloading file: {file_path}") + max_retries = 5 + for attempt in range(max_retries): + try: + async with session.get(url) as resp: + resp.raise_for_status() + with open(file_path, "wb") as f: + async for chunk in resp.content.iter_chunked(32 * 1024): + f.write(chunk) + break + except Exception as e: + logging.info(f"[ Rodin3D API - download_files ] Error downloading {file_path}:{e}") + if attempt < max_retries - 1: + logging.info("Retrying...") + await asyncio.sleep(2) + else: + logging.info( + "[ Rodin3D API - download_files ] Failed to download %s after %s attempts.", + file_path, + max_retries, + ) + return model_file_path + + +class Rodin3D_Regular(comfy_io.ComfyNode): + """Generate 3D Assets using Rodin API""" + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "Images": - ( - IO.IMAGE, - { - "forceInput":True, - } - ) - }, - "optional": { - **COMMON_PARAMETERS - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="Rodin3D_Regular", + display_name="Rodin 3D Generate - Regular Generate", + category="api node/3d/Rodin", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("Images"), + *COMMON_PARAMETERS, + ], + outputs=[comfy_io.String.Output(display_name="3D Model Path")], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + ], + is_api_node=True, + ) - async def api_call( - self, + @classmethod + async def execute( + cls, Images, Seed, Material_Type, Polygon_count, - **kwargs - ): + ) -> comfy_io.NodeOutput: tier = "Regular" num_images = Images.shape[0] m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality_override = self.get_quality_mode(Polygon_count) - task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, - quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, - **kwargs) - await self.poll_for_task_status(subscription_key, **kwargs) - download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list, task_uuid) - - return (model,) - - -class Rodin3D_Detail(Rodin3DAPI): - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "Images": - ( - IO.IMAGE, - { - "forceInput":True, - } - ) - }, - "optional": { - **COMMON_PARAMETERS - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, + mesh_mode, quality_override = get_quality_mode(Polygon_count) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, } + task_uuid, subscription_key = await create_generate_task( + images=m_images, + seed=Seed, + material=Material_Type, + quality_override=quality_override, + tier=tier, + mesh_mode=mesh_mode, + auth_kwargs=auth, + ) + await poll_for_task_status(subscription_key, auth_kwargs=auth) + download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) + model = await download_files(download_list, task_uuid) - async def api_call( - self, + return comfy_io.NodeOutput(model) + + +class Rodin3D_Detail(comfy_io.ComfyNode): + """Generate 3D Assets using Rodin API""" + + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="Rodin3D_Detail", + display_name="Rodin 3D Generate - Detail Generate", + category="api node/3d/Rodin", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("Images"), + *COMMON_PARAMETERS, + ], + outputs=[comfy_io.String.Output(display_name="3D Model Path")], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, Images, Seed, Material_Type, Polygon_count, - **kwargs - ): + ) -> comfy_io.NodeOutput: tier = "Detail" num_images = Images.shape[0] m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality_override = self.get_quality_mode(Polygon_count) - task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, - quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, - **kwargs) - await self.poll_for_task_status(subscription_key, **kwargs) - download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list, task_uuid) - - return (model,) - - -class Rodin3D_Smooth(Rodin3DAPI): - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "Images": - ( - IO.IMAGE, - { - "forceInput":True, - } - ) - }, - "optional": { - **COMMON_PARAMETERS - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, + mesh_mode, quality_override = get_quality_mode(Polygon_count) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, } + task_uuid, subscription_key = await create_generate_task( + images=m_images, + seed=Seed, + material=Material_Type, + quality_override=quality_override, + tier=tier, + mesh_mode=mesh_mode, + auth_kwargs=auth, + ) + await poll_for_task_status(subscription_key, auth_kwargs=auth) + download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) + model = await download_files(download_list, task_uuid) - async def api_call( - self, + return comfy_io.NodeOutput(model) + + +class Rodin3D_Smooth(comfy_io.ComfyNode): + """Generate 3D Assets using Rodin API""" + + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="Rodin3D_Smooth", + display_name="Rodin 3D Generate - Smooth Generate", + category="api node/3d/Rodin", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("Images"), + *COMMON_PARAMETERS, + ], + outputs=[comfy_io.String.Output(display_name="3D Model Path")], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, Images, Seed, Material_Type, Polygon_count, - **kwargs - ): + ) -> comfy_io.NodeOutput: tier = "Smooth" num_images = Images.shape[0] m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality_override = self.get_quality_mode(Polygon_count) - task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, - quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, - **kwargs) - await self.poll_for_task_status(subscription_key, **kwargs) - download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list, task_uuid) - - return (model,) - - -class Rodin3D_Sketch(Rodin3DAPI): - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "Images": - ( - IO.IMAGE, - { - "forceInput":True, - } - ) - }, - "optional": { - "Seed": - ( - IO.INT, - { - "default":0, - "min":0, - "max":65535, - "display":"number" - } - ) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, + mesh_mode, quality_override = get_quality_mode(Polygon_count) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, } + task_uuid, subscription_key = await create_generate_task( + images=m_images, + seed=Seed, + material=Material_Type, + quality_override=quality_override, + tier=tier, + mesh_mode=mesh_mode, + auth_kwargs=auth, + ) + await poll_for_task_status(subscription_key, auth_kwargs=auth) + download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) + model = await download_files(download_list, task_uuid) - async def api_call( - self, + return comfy_io.NodeOutput(model) + + +class Rodin3D_Sketch(comfy_io.ComfyNode): + """Generate 3D Assets using Rodin API""" + + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="Rodin3D_Sketch", + display_name="Rodin 3D Generate - Sketch Generate", + category="api node/3d/Rodin", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("Images"), + comfy_io.Int.Input( + "Seed", + default=0, + min=0, + max=65535, + display_mode=comfy_io.NumberDisplay.number, + optional=True, + ), + ], + outputs=[comfy_io.String.Output(display_name="3D Model Path")], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, Images, Seed, - **kwargs - ): + ) -> comfy_io.NodeOutput: tier = "Sketch" num_images = Images.shape[0] m_images = [] @@ -461,104 +470,110 @@ class Rodin3D_Sketch(Rodin3DAPI): material_type = "PBR" quality_override = 18000 mesh_mode = "Quad" - task_uuid, subscription_key = await self.create_generate_task( - images=m_images, seed=Seed, material=material_type, quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, **kwargs - ) - await self.poll_for_task_status(subscription_key, **kwargs) - download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list, task_uuid) - - return (model,) - -class Rodin3D_Gen2(Rodin3DAPI): - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "Images": - ( - IO.IMAGE, - { - "forceInput":True, - } - ) - }, - "optional": { - "Seed": ( - IO.INT, - { - "default":0, - "min":0, - "max":65535, - "display":"number" - } - ), - "Material_Type": ( - IO.COMBO, - { - "options": ["PBR", "Shaded"], - "default": "PBR" - } - ), - "Polygon_count": ( - IO.COMBO, - { - "options": ["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "2K-Triangle", "20K-Triangle", "150K-Triangle", "500K-Triangle"], - "default": "500K-Triangle" - } - ), - "TAPose": ( - IO.BOOLEAN, - { - "default": False, - } - ) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, } + task_uuid, subscription_key = await create_generate_task( + images=m_images, + seed=Seed, + material=material_type, + quality_override=quality_override, + tier=tier, + mesh_mode=mesh_mode, + auth_kwargs=auth, + ) + await poll_for_task_status(subscription_key, auth_kwargs=auth) + download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) + model = await download_files(download_list, task_uuid) - async def api_call( - self, + return comfy_io.NodeOutput(model) + + +class Rodin3D_Gen2(comfy_io.ComfyNode): + """Generate 3D Assets using Rodin API""" + + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="Rodin3D_Gen2", + display_name="Rodin 3D Generate - Gen-2 Generate", + category="api node/3d/Rodin", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + comfy_io.Image.Input("Images"), + comfy_io.Int.Input( + "Seed", + default=0, + min=0, + max=65535, + display_mode=comfy_io.NumberDisplay.number, + optional=True, + ), + comfy_io.Combo.Input("Material_Type", options=["PBR", "Shaded"], default="PBR", optional=True), + comfy_io.Combo.Input( + "Polygon_count", + options=["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "2K-Triangle", "20K-Triangle", "150K-Triangle", "500K-Triangle"], + default="500K-Triangle", + optional=True, + ), + comfy_io.Boolean.Input("TAPose", default=False), + ], + outputs=[comfy_io.String.Output(display_name="3D Model Path")], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, Images, Seed, Material_Type, Polygon_count, TAPose, - **kwargs - ): + ) -> comfy_io.NodeOutput: tier = "Gen-2" num_images = Images.shape[0] m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality_override = self.get_quality_mode(Polygon_count) - task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, - quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, TAPose=TAPose, - **kwargs) - await self.poll_for_task_status(subscription_key, **kwargs) - download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list, task_uuid) + mesh_mode, quality_override = get_quality_mode(Polygon_count) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + task_uuid, subscription_key = await create_generate_task( + images=m_images, + seed=Seed, + material=Material_Type, + quality_override=quality_override, + tier=tier, + mesh_mode=mesh_mode, + TAPose=TAPose, + auth_kwargs=auth, + ) + await poll_for_task_status(subscription_key, auth_kwargs=auth) + download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) + model = await download_files(download_list, task_uuid) - return (model,) + return comfy_io.NodeOutput(model) -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "Rodin3D_Regular": Rodin3D_Regular, - "Rodin3D_Detail": Rodin3D_Detail, - "Rodin3D_Smooth": Rodin3D_Smooth, - "Rodin3D_Sketch": Rodin3D_Sketch, - "Rodin3D_Gen2": Rodin3D_Gen2, -} -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "Rodin3D_Regular": "Rodin 3D Generate - Regular Generate", - "Rodin3D_Detail": "Rodin 3D Generate - Detail Generate", - "Rodin3D_Smooth": "Rodin 3D Generate - Smooth Generate", - "Rodin3D_Sketch": "Rodin 3D Generate - Sketch Generate", - "Rodin3D_Gen2": "Rodin 3D Generate - Gen-2 Generate", -} +class Rodin3DExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + Rodin3D_Regular, + Rodin3D_Detail, + Rodin3D_Smooth, + Rodin3D_Sketch, + Rodin3D_Gen2, + ] + + +async def comfy_entrypoint() -> Rodin3DExtension: + return Rodin3DExtension() From b1fa1922df597af759150f4e26ecb276c9753ee4 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 4 Oct 2025 22:33:48 +0300 Subject: [PATCH 401/759] convert nodes_stable3d.py to V3 schema (#10204) --- comfy_extras/nodes_stable3d.py | 149 +++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 63 deletions(-) diff --git a/comfy_extras/nodes_stable3d.py b/comfy_extras/nodes_stable3d.py index be2e34c28..c6d8a683d 100644 --- a/comfy_extras/nodes_stable3d.py +++ b/comfy_extras/nodes_stable3d.py @@ -1,6 +1,8 @@ import torch import nodes import comfy.utils +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io def camera_embeddings(elevation, azimuth): elevation = torch.as_tensor([elevation]) @@ -20,26 +22,31 @@ def camera_embeddings(elevation, azimuth): return embeddings -class StableZero123_Conditioning: +class StableZero123_Conditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "clip_vision": ("CLIP_VISION",), - "init_image": ("IMAGE",), - "vae": ("VAE",), - "width": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), - "height": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - "elevation": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), - "azimuth": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), - }} - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") + def define_schema(cls): + return io.Schema( + node_id="StableZero123_Conditioning", + category="conditioning/3d_models", + inputs=[ + io.ClipVision.Input("clip_vision"), + io.Image.Input("init_image"), + io.Vae.Input("vae"), + io.Int.Input("width", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("height", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Float.Input("elevation", default=0.0, min=-180.0, max=180.0, step=0.1, round=False), + io.Float.Input("azimuth", default=0.0, min=-180.0, max=180.0, step=0.1, round=False) + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent") + ] + ) - FUNCTION = "encode" - - CATEGORY = "conditioning/3d_models" - - def encode(self, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth): + @classmethod + def execute(cls, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth) -> io.NodeOutput: output = clip_vision.encode_image(init_image) pooled = output.image_embeds.unsqueeze(0) pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1) @@ -51,30 +58,35 @@ class StableZero123_Conditioning: positive = [[cond, {"concat_latent_image": t}]] negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t)}]] latent = torch.zeros([batch_size, 4, height // 8, width // 8]) - return (positive, negative, {"samples":latent}) + return io.NodeOutput(positive, negative, {"samples":latent}) -class StableZero123_Conditioning_Batched: +class StableZero123_Conditioning_Batched(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "clip_vision": ("CLIP_VISION",), - "init_image": ("IMAGE",), - "vae": ("VAE",), - "width": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), - "height": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - "elevation": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), - "azimuth": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), - "elevation_batch_increment": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), - "azimuth_batch_increment": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), - }} - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") + def define_schema(cls): + return io.Schema( + node_id="StableZero123_Conditioning_Batched", + category="conditioning/3d_models", + inputs=[ + io.ClipVision.Input("clip_vision"), + io.Image.Input("init_image"), + io.Vae.Input("vae"), + io.Int.Input("width", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("height", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Float.Input("elevation", default=0.0, min=-180.0, max=180.0, step=0.1, round=False), + io.Float.Input("azimuth", default=0.0, min=-180.0, max=180.0, step=0.1, round=False), + io.Float.Input("elevation_batch_increment", default=0.0, min=-180.0, max=180.0, step=0.1, round=False), + io.Float.Input("azimuth_batch_increment", default=0.0, min=-180.0, max=180.0, step=0.1, round=False) + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent") + ] + ) - FUNCTION = "encode" - - CATEGORY = "conditioning/3d_models" - - def encode(self, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth, elevation_batch_increment, azimuth_batch_increment): + @classmethod + def execute(cls, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth, elevation_batch_increment, azimuth_batch_increment) -> io.NodeOutput: output = clip_vision.encode_image(init_image) pooled = output.image_embeds.unsqueeze(0) pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1) @@ -93,27 +105,32 @@ class StableZero123_Conditioning_Batched: positive = [[cond, {"concat_latent_image": t}]] negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t)}]] latent = torch.zeros([batch_size, 4, height // 8, width // 8]) - return (positive, negative, {"samples":latent, "batch_index": [0] * batch_size}) + return io.NodeOutput(positive, negative, {"samples":latent, "batch_index": [0] * batch_size}) -class SV3D_Conditioning: +class SV3D_Conditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "clip_vision": ("CLIP_VISION",), - "init_image": ("IMAGE",), - "vae": ("VAE",), - "width": ("INT", {"default": 576, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), - "height": ("INT", {"default": 576, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), - "video_frames": ("INT", {"default": 21, "min": 1, "max": 4096}), - "elevation": ("FLOAT", {"default": 0.0, "min": -90.0, "max": 90.0, "step": 0.1, "round": False}), - }} - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") + def define_schema(cls): + return io.Schema( + node_id="SV3D_Conditioning", + category="conditioning/3d_models", + inputs=[ + io.ClipVision.Input("clip_vision"), + io.Image.Input("init_image"), + io.Vae.Input("vae"), + io.Int.Input("width", default=576, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("height", default=576, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("video_frames", default=21, min=1, max=4096), + io.Float.Input("elevation", default=0.0, min=-90.0, max=90.0, step=0.1, round=False) + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent") + ] + ) - FUNCTION = "encode" - - CATEGORY = "conditioning/3d_models" - - def encode(self, clip_vision, init_image, vae, width, height, video_frames, elevation): + @classmethod + def execute(cls, clip_vision, init_image, vae, width, height, video_frames, elevation) -> io.NodeOutput: output = clip_vision.encode_image(init_image) pooled = output.image_embeds.unsqueeze(0) pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1) @@ -133,11 +150,17 @@ class SV3D_Conditioning: positive = [[pooled, {"concat_latent_image": t, "elevation": elevations, "azimuth": azimuths}]] negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t), "elevation": elevations, "azimuth": azimuths}]] latent = torch.zeros([video_frames, 4, height // 8, width // 8]) - return (positive, negative, {"samples":latent}) + return io.NodeOutput(positive, negative, {"samples":latent}) -NODE_CLASS_MAPPINGS = { - "StableZero123_Conditioning": StableZero123_Conditioning, - "StableZero123_Conditioning_Batched": StableZero123_Conditioning_Batched, - "SV3D_Conditioning": SV3D_Conditioning, -} +class Stable3DExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + StableZero123_Conditioning, + StableZero123_Conditioning_Batched, + SV3D_Conditioning, + ] + +async def comfy_entrypoint() -> Stable3DExtension: + return Stable3DExtension() From caf07331ff1b20f4104b9693ed244d6e22f80b5a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 4 Oct 2025 19:05:05 -0700 Subject: [PATCH 402/759] Remove soundfile dependency. No more torchaudio load or save. (#10210) --- comfy_extras/nodes_audio.py | 2 +- requirements.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 51c8b9dd9..1c868fcba 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -360,7 +360,7 @@ class RecordAudio: def load(self, audio): audio_path = folder_paths.get_annotated_filepath(audio) - waveform, sample_rate = torchaudio.load(audio_path) + waveform, sample_rate = load(audio_path) audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate} return (audio, ) diff --git a/requirements.txt b/requirements.txt index 588c5dcf0..6c28f9478 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,6 +25,5 @@ av>=14.2.0 #non essential dependencies: kornia>=0.7.1 spandrel -soundfile pydantic~=2.0 pydantic-settings~=2.0 From 187f43696dd58f252075d2e3c6873706eb6b5fa1 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 5 Oct 2025 09:34:18 +0300 Subject: [PATCH 403/759] fix(api-nodes): disable "std" mode for Kling2.5-turbo (#10212) --- comfy_api_nodes/nodes_kling.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index d8646f106..44fccc0c7 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -712,6 +712,9 @@ class KlingImage2VideoNode(KlingNodeBase): # Camera control type for image 2 video is always `simple` camera_control.type = KlingCameraControlType.simple + if mode == "std" and model_name == KlingVideoGenModelName.kling_v2_5_turbo.value: + mode = "pro" # October 5: currently "std" mode is not supported for this model + initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_IMAGE_TO_VIDEO, From 195e0b063950f585fe584c5ce7b0b689f8d20ff8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 5 Oct 2025 12:41:19 -0700 Subject: [PATCH 404/759] Remove useless code. (#10223) --- comfy/ldm/ace/vae/music_dcae_pipeline.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/comfy/ldm/ace/vae/music_dcae_pipeline.py b/comfy/ldm/ace/vae/music_dcae_pipeline.py index af81280eb..3c8830c17 100644 --- a/comfy/ldm/ace/vae/music_dcae_pipeline.py +++ b/comfy/ldm/ace/vae/music_dcae_pipeline.py @@ -23,8 +23,6 @@ class MusicDCAE(torch.nn.Module): else: self.source_sample_rate = source_sample_rate - # self.resampler = torchaudio.transforms.Resample(source_sample_rate, 44100) - self.transform = transforms.Compose([ transforms.Normalize(0.5, 0.5), ]) @@ -37,10 +35,6 @@ class MusicDCAE(torch.nn.Module): self.scale_factor = 0.1786 self.shift_factor = -1.9091 - def load_audio(self, audio_path): - audio, sr = torchaudio.load(audio_path) - return audio, sr - def forward_mel(self, audios): mels = [] for i in range(len(audios)): @@ -73,10 +67,8 @@ class MusicDCAE(torch.nn.Module): latent = self.dcae.encoder(mel.unsqueeze(0)) latents.append(latent) latents = torch.cat(latents, dim=0) - # latent_lengths = (audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple).long() latents = (latents - self.shift_factor) * self.scale_factor return latents - # return latents, latent_lengths @torch.no_grad() def decode(self, latents, audio_lengths=None, sr=None): @@ -91,9 +83,7 @@ class MusicDCAE(torch.nn.Module): wav = self.vocoder.decode(mels[0]).squeeze(1) if sr is not None: - # resampler = torchaudio.transforms.Resample(44100, sr).to(latents.device).to(latents.dtype) wav = torchaudio.functional.resample(wav, 44100, sr) - # wav = resampler(wav) else: sr = 44100 pred_wavs.append(wav) @@ -101,7 +91,6 @@ class MusicDCAE(torch.nn.Module): if audio_lengths is not None: pred_wavs = [wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)] return torch.stack(pred_wavs) - # return sr, pred_wavs def forward(self, audios, audio_lengths=None, sr=None): latents, latent_lengths = self.encode(audios=audios, audio_lengths=audio_lengths, sr=sr) From 7326e46deeab97219cad32d0624991f9ffea4fe5 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 7 Oct 2025 01:57:00 +0800 Subject: [PATCH 405/759] Update template to 0.1.93 (#10235) * Update template to 0.1.92 * Update template to 0.1.93 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6c28f9478..db0486960 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.27.7 -comfyui-workflow-templates==0.1.91 +comfyui-workflow-templates==0.1.93 comfyui-embedded-docs==0.2.6 torch torchsde From 6bd3f8eb9ff2d7c74e8ca75ad1f854a6b256b714 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 6 Oct 2025 14:49:04 -0400 Subject: [PATCH 406/759] ComfyUI version 0.3.63 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index ac76fbe35..c3257d4bf 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.62" +__version__ = "0.3.63" diff --git a/pyproject.toml b/pyproject.toml index 383e7d10a..a9e3de0c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.62" +version = "0.3.63" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 6ae35158013e50698e680344ab1f54de0d59fef0 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 7 Oct 2025 02:05:57 +0300 Subject: [PATCH 407/759] fix(api-nodes): enable more pylint rules (#10213) --- comfy_api_nodes/apinode_utils.py | 2 +- comfy_api_nodes/nodes_moonvalley.py | 3 +-- comfy_api_nodes/nodes_recraft.py | 8 ++++---- pyproject.toml | 6 +----- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py index 37438f835..5ac3b92aa 100644 --- a/comfy_api_nodes/apinode_utils.py +++ b/comfy_api_nodes/apinode_utils.py @@ -152,7 +152,7 @@ def validate_aspect_ratio( raise TypeError( f"Aspect ratio cannot reduce to any less than {minimum_ratio_str} ({minimum_ratio}), but was {aspect_ratio} ({calculated_ratio})." ) - elif calculated_ratio > maximum_ratio: + if calculated_ratio > maximum_ratio: raise TypeError( f"Aspect ratio cannot reduce to any greater than {maximum_ratio_str} ({maximum_ratio}), but was {aspect_ratio} ({calculated_ratio})." ) diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 6467dd614..55471a69d 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -473,7 +473,7 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): height=width_height["height"], use_negative_prompts=True, ) - """Upload image to comfy backend to have a URL available for further processing""" + # Get MIME type from tensor - assuming PNG format for image tensors mime_type = "image/png" @@ -591,7 +591,6 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): validated_video = validate_video_to_video_input(video) video_url = await upload_video_to_comfyapi(validated_video, auth_kwargs=auth) - """Validate prompts and inference input""" validate_prompts(prompt, negative_prompt) # Only include motion_intensity for Motion Transfer diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index a006104b7..0bbb551b8 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -107,7 +107,7 @@ def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, co # if list already exists exists, just extend list with data for check_list in lists_to_check: for conv_tuple in check_list: - if conv_tuple[0] == parent_key and type(conv_tuple[1]) is list: + if conv_tuple[0] == parent_key and isinstance(conv_tuple[1], list): conv_tuple[1].append(formatter(data)) return True return False @@ -119,7 +119,7 @@ def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, co if formatter is None: formatter = lambda v: v # Multipart representation of value - if type(data) is not dict: + if not isinstance(data, dict): # if list already exists exists, just extend list with data added = handle_converted_lists(data, parent_key, converted_to_check) if added: @@ -136,9 +136,9 @@ def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, co for key, value in data.items(): current_key = key if parent_key is None else f"{parent_key}[{key}]" - if type(value) is dict: + if isinstance(value, dict): converted.extend(recraft_multipart_parser(value, current_key, formatter, next_check).items()) - elif type(value) is list: + elif isinstance(value, list): for ind, list_value in enumerate(value): iter_key = f"{current_key}[]" converted.extend(recraft_multipart_parser(list_value, iter_key, formatter, next_check, is_list=True).items()) diff --git a/pyproject.toml b/pyproject.toml index a9e3de0c6..abd1a5f5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,18 +57,14 @@ messages_control.disable = [ "redefined-builtin", "unnecessary-lambda", "dangerous-default-value", + "invalid-overridden-method", # next warnings should be fixed in future "bad-classmethod-argument", # Class method should have 'cls' as first argument "wrong-import-order", # Standard imports should be placed before third party imports "logging-fstring-interpolation", # Use lazy % formatting in logging functions "ungrouped-imports", "unnecessary-pass", - "unidiomatic-typecheck", "unnecessary-lambda-assignment", "no-else-return", - "no-else-raise", - "invalid-overridden-method", "unused-variable", - "pointless-string-statement", - "redefined-outer-name", ] From a49007a7b07abfdcb10bc10c23514c48935ea914 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 7 Oct 2025 02:13:43 +0300 Subject: [PATCH 408/759] fix(api-nodes): allow negative_prompt PixVerse to be multiline (#10196) --- comfy_api_nodes/nodes_pixverse.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy_api_nodes/nodes_pixverse.py b/comfy_api_nodes/nodes_pixverse.py index eb98e9653..2c91bbc65 100644 --- a/comfy_api_nodes/nodes_pixverse.py +++ b/comfy_api_nodes/nodes_pixverse.py @@ -146,7 +146,7 @@ class PixverseTextToVideoNode(comfy_io.ComfyNode): comfy_io.String.Input( "negative_prompt", default="", - force_input=True, + multiline=True, tooltip="An optional text description of undesired elements on an image.", optional=True, ), @@ -284,7 +284,7 @@ class PixverseImageToVideoNode(comfy_io.ComfyNode): comfy_io.String.Input( "negative_prompt", default="", - force_input=True, + multiline=True, tooltip="An optional text description of undesired elements on an image.", optional=True, ), @@ -425,7 +425,7 @@ class PixverseTransitionVideoNode(comfy_io.ComfyNode): comfy_io.String.Input( "negative_prompt", default="", - force_input=True, + multiline=True, tooltip="An optional text description of undesired elements on an image.", optional=True, ), From e77e0a8f8fdcdc53deb8207e0d5b16ca56824a4b Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 7 Oct 2025 02:20:26 +0300 Subject: [PATCH 409/759] convert nodes_pika.py to V3 schema (#10216) --- comfy_api_nodes/nodes_pika.py | 779 ++++++++++++++++------------------ 1 file changed, 373 insertions(+), 406 deletions(-) diff --git a/comfy_api_nodes/nodes_pika.py b/comfy_api_nodes/nodes_pika.py index a8dc43cb3..0a9f04cc2 100644 --- a/comfy_api_nodes/nodes_pika.py +++ b/comfy_api_nodes/nodes_pika.py @@ -5,14 +5,16 @@ Pika API docs: https://pika-827374fb.mintlify.app/api-reference """ from __future__ import annotations -import io +from io import BytesIO import logging from typing import Optional, TypeVar +from enum import Enum import numpy as np import torch -from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeOptions +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io as comfy_io from comfy_api.input_impl import VideoFromFile from comfy_api.input_impl.video_types import VideoCodec, VideoContainer, VideoInput from comfy_api_nodes.apinode_utils import ( @@ -20,7 +22,6 @@ from comfy_api_nodes.apinode_utils import ( tensor_to_bytesio, ) from comfy_api_nodes.apis import ( - IngredientsMode, PikaBodyGenerate22C2vGenerate22PikascenesPost, PikaBodyGenerate22I2vGenerate22I2vPost, PikaBodyGenerate22KeyframeGenerate22PikaframesPost, @@ -28,10 +29,7 @@ from comfy_api_nodes.apis import ( PikaBodyGeneratePikadditionsGeneratePikadditionsPost, PikaBodyGeneratePikaffectsGeneratePikaffectsPost, PikaBodyGeneratePikaswapsGeneratePikaswapsPost, - PikaDurationEnum, - Pikaffect, PikaGenerateResponse, - PikaResolutionEnum, PikaVideoResponse, ) from comfy_api_nodes.apis.client import ( @@ -41,7 +39,6 @@ from comfy_api_nodes.apis.client import ( PollingOperation, SynchronousOperation, ) -from comfy_api_nodes.mapper_utils import model_field_to_node_input R = TypeVar("R") @@ -58,6 +55,35 @@ PATH_PIKASCENES = f"/proxy/pika/generate/{PIKA_API_VERSION}/pikascenes" PATH_VIDEO_GET = "/proxy/pika/videos" +class PikaDurationEnum(int, Enum): + integer_5 = 5 + integer_10 = 10 + + +class PikaResolutionEnum(str, Enum): + field_1080p = "1080p" + field_720p = "720p" + + +class Pikaffect(str, Enum): + Cake_ify = "Cake-ify" + Crumble = "Crumble" + Crush = "Crush" + Decapitate = "Decapitate" + Deflate = "Deflate" + Dissolve = "Dissolve" + Explode = "Explode" + Eye_pop = "Eye-pop" + Inflate = "Inflate" + Levitate = "Levitate" + Melt = "Melt" + Peel = "Peel" + Poke = "Poke" + Squish = "Squish" + Ta_da = "Ta-da" + Tear = "Tear" + + class PikaApiError(Exception): """Exception for Pika API errors.""" @@ -74,155 +100,121 @@ def is_valid_initial_response(response: PikaGenerateResponse) -> bool: return hasattr(response, "video_id") and response.video_id is not None -class PikaNodeBase(ComfyNodeABC): - """Base class for Pika nodes.""" +async def poll_for_task_status( + task_id: str, + auth_kwargs: Optional[dict[str, str]] = None, + node_id: Optional[str] = None, +) -> PikaGenerateResponse: + polling_operation = PollingOperation( + poll_endpoint=ApiEndpoint( + path=f"{PATH_VIDEO_GET}/{task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=PikaVideoResponse, + ), + completed_statuses=[ + "finished", + ], + failed_statuses=["failed", "cancelled"], + status_extractor=lambda response: ( + response.status.value if response.status else None + ), + progress_extractor=lambda response: ( + response.progress if hasattr(response, "progress") else None + ), + auth_kwargs=auth_kwargs, + result_url_extractor=lambda response: ( + response.url if hasattr(response, "url") else None + ), + node_id=node_id, + estimated_duration=60 + ) + return await polling_operation.execute() - @classmethod - def get_base_inputs_types( - cls, request_model - ) -> dict[str, tuple[IO, InputTypeOptions]]: - """Get the base required inputs types common to all Pika nodes.""" - return { - "prompt_text": model_field_to_node_input( - IO.STRING, - request_model, - "promptText", - multiline=True, - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - request_model, - "negativePrompt", - multiline=True, - ), - "seed": model_field_to_node_input( - IO.INT, - request_model, - "seed", - min=0, - max=0xFFFFFFFF, - control_after_generate=True, - ), - "resolution": model_field_to_node_input( - IO.COMBO, - request_model, - "resolution", - enum_type=PikaResolutionEnum, - ), - "duration": model_field_to_node_input( - IO.COMBO, - request_model, - "duration", - enum_type=PikaDurationEnum, - ), - } - CATEGORY = "api node/video/Pika" - API_NODE = True - FUNCTION = "api_call" - RETURN_TYPES = ("VIDEO",) +async def execute_task( + initial_operation: SynchronousOperation[R, PikaGenerateResponse], + auth_kwargs: Optional[dict[str, str]] = None, + node_id: Optional[str] = None, +) -> tuple[VideoFromFile]: + """Executes the initial operation then polls for the task status until it is completed. - async def poll_for_task_status( - self, - task_id: str, - auth_kwargs: Optional[dict[str, str]] = None, - node_id: Optional[str] = None, - ) -> PikaGenerateResponse: - polling_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"{PATH_VIDEO_GET}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=PikaVideoResponse, - ), - completed_statuses=[ - "finished", - ], - failed_statuses=["failed", "cancelled"], - status_extractor=lambda response: ( - response.status.value if response.status else None - ), - progress_extractor=lambda response: ( - response.progress if hasattr(response, "progress") else None - ), - auth_kwargs=auth_kwargs, - result_url_extractor=lambda response: ( - response.url if hasattr(response, "url") else None - ), - node_id=node_id, - estimated_duration=60 + Args: + initial_operation: The initial operation to execute. + auth_kwargs: The authentication token(s) to use for the API call. + + Returns: + A tuple containing the video file as a VIDEO output. + """ + initial_response = await initial_operation.execute() + if not is_valid_initial_response(initial_response): + error_msg = f"Pika initial request failed. Code: {initial_response.code}, Message: {initial_response.message}, Data: {initial_response.data}" + logging.error(error_msg) + raise PikaApiError(error_msg) + + task_id = initial_response.video_id + final_response = await poll_for_task_status(task_id, auth_kwargs, node_id=node_id) + if not is_valid_video_response(final_response): + error_msg = ( + f"Pika task {task_id} succeeded but no video data found in response." ) - return await polling_operation.execute() + logging.error(error_msg) + raise PikaApiError(error_msg) - async def execute_task( - self, - initial_operation: SynchronousOperation[R, PikaGenerateResponse], - auth_kwargs: Optional[dict[str, str]] = None, - node_id: Optional[str] = None, - ) -> tuple[VideoFromFile]: - """Executes the initial operation then polls for the task status until it is completed. + video_url = str(final_response.url) + logging.info("Pika task %s succeeded. Video URL: %s", task_id, video_url) - Args: - initial_operation: The initial operation to execute. - auth_kwargs: The authentication token(s) to use for the API call. - - Returns: - A tuple containing the video file as a VIDEO output. - """ - initial_response = await initial_operation.execute() - if not is_valid_initial_response(initial_response): - error_msg = f"Pika initial request failed. Code: {initial_response.code}, Message: {initial_response.message}, Data: {initial_response.data}" - logging.error(error_msg) - raise PikaApiError(error_msg) - - task_id = initial_response.video_id - final_response = await self.poll_for_task_status(task_id, auth_kwargs) - if not is_valid_video_response(final_response): - error_msg = ( - f"Pika task {task_id} succeeded but no video data found in response." - ) - logging.error(error_msg) - raise PikaApiError(error_msg) - - video_url = str(final_response.url) - logging.info("Pika task %s succeeded. Video URL: %s", task_id, video_url) - - return (await download_url_to_video_output(video_url),) + return (await download_url_to_video_output(video_url),) -class PikaImageToVideoV2_2(PikaNodeBase): +def get_base_inputs_types() -> list[comfy_io.Input]: + """Get the base required inputs types common to all Pika nodes.""" + return [ + comfy_io.String.Input("prompt_text", multiline=True), + comfy_io.String.Input("negative_prompt", multiline=True), + comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), + comfy_io.Combo.Input( + "resolution", options=[resolution.value for resolution in PikaResolutionEnum], default="1080p" + ), + comfy_io.Combo.Input( + "duration", options=[duration.value for duration in PikaDurationEnum], default=5 + ), + ] + + +class PikaImageToVideoV2_2(comfy_io.ComfyNode): """Pika 2.2 Image to Video Node.""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "image": ( - IO.IMAGE, - {"tooltip": "The image to convert to video"}, - ), - **cls.get_base_inputs_types(PikaBodyGenerate22I2vGenerate22I2vPost), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="PikaImageToVideoNode2_2", + display_name="Pika Image to Video", + description="Sends an image and prompt to the Pika API v2.2 to generate a video.", + category="api node/video/Pika", + inputs=[ + comfy_io.Image.Input("image", tooltip="The image to convert to video"), + *get_base_inputs_types(), + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Sends an image and prompt to the Pika API v2.2 to generate a video." - - async def api_call( - self, + @classmethod + async def execute( + cls, image: torch.Tensor, prompt_text: str, negative_prompt: str, seed: int, resolution: str, duration: int, - unique_id: str, - **kwargs, - ) -> tuple[VideoFromFile]: + ) -> comfy_io.NodeOutput: # Convert image to BytesIO image_bytes_io = tensor_to_bytesio(image) image_bytes_io.seek(0) @@ -237,7 +229,10 @@ class PikaImageToVideoV2_2(PikaNodeBase): resolution=resolution, duration=duration, ) - + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_IMAGE_TO_VIDEO, @@ -248,50 +243,55 @@ class PikaImageToVideoV2_2(PikaNodeBase): request=pika_request_data, files=pika_files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) - - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaTextToVideoNodeV2_2(PikaNodeBase): +class PikaTextToVideoNodeV2_2(comfy_io.ComfyNode): """Pika Text2Video v2.2 Node.""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - **cls.get_base_inputs_types(PikaBodyGenerate22T2vGenerate22T2vPost), - "aspect_ratio": model_field_to_node_input( - IO.FLOAT, - PikaBodyGenerate22T2vGenerate22T2vPost, - "aspectRatio", + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="PikaTextToVideoNode2_2", + display_name="Pika Text to Video", + description="Sends a text prompt to the Pika API v2.2 to generate a video.", + category="api node/video/Pika", + inputs=[ + *get_base_inputs_types(), + comfy_io.Float.Input( + "aspect_ratio", step=0.001, min=0.4, max=2.5, default=1.7777777777777777, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + tooltip="Aspect ratio (width / height)", + ) + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Sends a text prompt to the Pika API v2.2 to generate a video." - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt_text: str, negative_prompt: str, seed: int, resolution: str, duration: int, aspect_ratio: float, - unique_id: str, - **kwargs, - ) -> tuple[VideoFromFile]: + ) -> comfy_io.NodeOutput: + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_TEXT_TO_VIDEO, @@ -307,62 +307,75 @@ class PikaTextToVideoNodeV2_2(PikaNodeBase): duration=duration, aspectRatio=aspect_ratio, ), - auth_kwargs=kwargs, + auth_kwargs=auth, content_type="application/x-www-form-urlencoded", ) - - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaScenesV2_2(PikaNodeBase): +class PikaScenesV2_2(comfy_io.ComfyNode): """PikaScenes v2.2 Node.""" @classmethod - def INPUT_TYPES(cls): - image_ingredient_input = ( - IO.IMAGE, - {"tooltip": "Image that will be used as ingredient to create a video."}, - ) - return { - "required": { - **cls.get_base_inputs_types( - PikaBodyGenerate22C2vGenerate22PikascenesPost, - ), - "ingredients_mode": model_field_to_node_input( - IO.COMBO, - PikaBodyGenerate22C2vGenerate22PikascenesPost, - "ingredientsMode", - enum_type=IngredientsMode, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="PikaScenesV2_2", + display_name="Pika Scenes (Video Image Composition)", + description="Combine your images to create a video with the objects in them. Upload multiple images as ingredients and generate a high-quality video that incorporates all of them.", + category="api node/video/Pika", + inputs=[ + *get_base_inputs_types(), + comfy_io.Combo.Input( + "ingredients_mode", + options=["creative", "precise"], default="creative", ), - "aspect_ratio": model_field_to_node_input( - IO.FLOAT, - PikaBodyGenerate22C2vGenerate22PikascenesPost, - "aspectRatio", + comfy_io.Float.Input( + "aspect_ratio", step=0.001, min=0.4, max=2.5, default=1.7777777777777777, + tooltip="Aspect ratio (width / height)", ), - }, - "optional": { - "image_ingredient_1": image_ingredient_input, - "image_ingredient_2": image_ingredient_input, - "image_ingredient_3": image_ingredient_input, - "image_ingredient_4": image_ingredient_input, - "image_ingredient_5": image_ingredient_input, - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + comfy_io.Image.Input( + "image_ingredient_1", + optional=True, + tooltip="Image that will be used as ingredient to create a video.", + ), + comfy_io.Image.Input( + "image_ingredient_2", + optional=True, + tooltip="Image that will be used as ingredient to create a video.", + ), + comfy_io.Image.Input( + "image_ingredient_3", + optional=True, + tooltip="Image that will be used as ingredient to create a video.", + ), + comfy_io.Image.Input( + "image_ingredient_4", + optional=True, + tooltip="Image that will be used as ingredient to create a video.", + ), + comfy_io.Image.Input( + "image_ingredient_5", + optional=True, + tooltip="Image that will be used as ingredient to create a video.", + ), + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Combine your images to create a video with the objects in them. Upload multiple images as ingredients and generate a high-quality video that incorporates all of them." - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt_text: str, negative_prompt: str, seed: int, @@ -370,14 +383,12 @@ class PikaScenesV2_2(PikaNodeBase): duration: int, ingredients_mode: str, aspect_ratio: float, - unique_id: str, image_ingredient_1: Optional[torch.Tensor] = None, image_ingredient_2: Optional[torch.Tensor] = None, image_ingredient_3: Optional[torch.Tensor] = None, image_ingredient_4: Optional[torch.Tensor] = None, image_ingredient_5: Optional[torch.Tensor] = None, - **kwargs, - ) -> tuple[VideoFromFile]: + ) -> comfy_io.NodeOutput: # Convert all passed images to BytesIO all_image_bytes_io = [] for image in [ @@ -406,7 +417,10 @@ class PikaScenesV2_2(PikaNodeBase): duration=duration, aspectRatio=aspect_ratio, ) - + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_PIKASCENES, @@ -417,63 +431,54 @@ class PikaScenesV2_2(PikaNodeBase): request=pika_request_data, files=pika_files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikAdditionsNode(PikaNodeBase): +class PikAdditionsNode(comfy_io.ComfyNode): """Pika Pikadditions Node. Add an image into a video.""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "video": (IO.VIDEO, {"tooltip": "The video to add an image to."}), - "image": (IO.IMAGE, {"tooltip": "The image to add to the video."}), - "prompt_text": model_field_to_node_input( - IO.STRING, - PikaBodyGeneratePikadditionsGeneratePikadditionsPost, - "promptText", - multiline=True, - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - PikaBodyGeneratePikadditionsGeneratePikadditionsPost, - "negativePrompt", - multiline=True, - ), - "seed": model_field_to_node_input( - IO.INT, - PikaBodyGeneratePikadditionsGeneratePikadditionsPost, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="Pikadditions", + display_name="Pikadditions (Video Object Insertion)", + description="Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result.", + category="api node/video/Pika", + inputs=[ + comfy_io.Video.Input("video", tooltip="The video to add an image to."), + comfy_io.Image.Input("image", tooltip="The image to add to the video."), + comfy_io.String.Input("prompt_text", multiline=True), + comfy_io.String.Input("negative_prompt", multiline=True), + comfy_io.Int.Input( "seed", min=0, max=0xFFFFFFFF, control_after_generate=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result." - - async def api_call( - self, + @classmethod + async def execute( + cls, video: VideoInput, image: torch.Tensor, prompt_text: str, negative_prompt: str, seed: int, - unique_id: str, - **kwargs, - ) -> tuple[VideoFromFile]: + ) -> comfy_io.NodeOutput: # Convert video to BytesIO - video_bytes_io = io.BytesIO() + video_bytes_io = BytesIO() video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264) video_bytes_io.seek(0) @@ -492,7 +497,10 @@ class PikAdditionsNode(PikaNodeBase): negativePrompt=negative_prompt, seed=seed, ) - + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_PIKADDITIONS, @@ -503,74 +511,51 @@ class PikAdditionsNode(PikaNodeBase): request=pika_request_data, files=pika_files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaSwapsNode(PikaNodeBase): +class PikaSwapsNode(comfy_io.ComfyNode): """Pika Pikaswaps Node.""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "video": (IO.VIDEO, {"tooltip": "The video to swap an object in."}), - "image": ( - IO.IMAGE, - { - "tooltip": "The image used to replace the masked object in the video." - }, - ), - "mask": ( - IO.MASK, - {"tooltip": "Use the mask to define areas in the video to replace"}, - ), - "prompt_text": model_field_to_node_input( - IO.STRING, - PikaBodyGeneratePikaswapsGeneratePikaswapsPost, - "promptText", - multiline=True, - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - PikaBodyGeneratePikaswapsGeneratePikaswapsPost, - "negativePrompt", - multiline=True, - ), - "seed": model_field_to_node_input( - IO.INT, - PikaBodyGeneratePikaswapsGeneratePikaswapsPost, - "seed", - min=0, - max=0xFFFFFFFF, - control_after_generate=True, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="Pikaswaps", + display_name="Pika Swaps (Video Object Replacement)", + description="Swap out any object or region of your video with a new image or object. Define areas to replace either with a mask or coordinates.", + category="api node/video/Pika", + inputs=[ + comfy_io.Video.Input("video", tooltip="The video to swap an object in."), + comfy_io.Image.Input("image", tooltip="The image used to replace the masked object in the video."), + comfy_io.Mask.Input("mask", tooltip="Use the mask to define areas in the video to replace"), + comfy_io.String.Input("prompt_text", multiline=True), + comfy_io.String.Input("negative_prompt", multiline=True), + comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Swap out any object or region of your video with a new image or object. Define areas to replace either with a mask or coordinates." - RETURN_TYPES = ("VIDEO",) - - async def api_call( - self, + @classmethod + async def execute( + cls, video: VideoInput, image: torch.Tensor, mask: torch.Tensor, prompt_text: str, negative_prompt: str, seed: int, - unique_id: str, - **kwargs, - ) -> tuple[VideoFromFile]: + ) -> comfy_io.NodeOutput: # Convert video to BytesIO - video_bytes_io = io.BytesIO() + video_bytes_io = BytesIO() video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264) video_bytes_io.seek(0) @@ -579,7 +564,7 @@ class PikaSwapsNode(PikaNodeBase): mask = mask.repeat(1, 3, 1, 1) # Convert 3-channel binary mask to BytesIO - mask_bytes_io = io.BytesIO() + mask_bytes_io = BytesIO() mask_bytes_io.write(mask.numpy().astype(np.uint8)) mask_bytes_io.seek(0) @@ -599,7 +584,10 @@ class PikaSwapsNode(PikaNodeBase): negativePrompt=negative_prompt, seed=seed, ) - + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_PIKADDITIONS, @@ -610,71 +598,52 @@ class PikaSwapsNode(PikaNodeBase): request=pika_request_data, files=pika_files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) - - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaffectsNode(PikaNodeBase): +class PikaffectsNode(comfy_io.ComfyNode): """Pika Pikaffects Node.""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "image": ( - IO.IMAGE, - {"tooltip": "The reference image to apply the Pikaffect to."}, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="Pikaffects", + display_name="Pikaffects (Video Effects)", + description="Generate a video with a specific Pikaffect. Supported Pikaffects: Cake-ify, Crumble, Crush, Decapitate, Deflate, Dissolve, Explode, Eye-pop, Inflate, Levitate, Melt, Peel, Poke, Squish, Ta-da, Tear", + category="api node/video/Pika", + inputs=[ + comfy_io.Image.Input("image", tooltip="The reference image to apply the Pikaffect to."), + comfy_io.Combo.Input( + "pikaffect", options=[pikaffect.value for pikaffect in Pikaffect], default="Cake-ify" ), - "pikaffect": model_field_to_node_input( - IO.COMBO, - PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - "pikaffect", - enum_type=Pikaffect, - default="Cake-ify", - ), - "prompt_text": model_field_to_node_input( - IO.STRING, - PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - "promptText", - multiline=True, - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - "negativePrompt", - multiline=True, - ), - "seed": model_field_to_node_input( - IO.INT, - PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - "seed", - min=0, - max=0xFFFFFFFF, - control_after_generate=True, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + comfy_io.String.Input("prompt_text", multiline=True), + comfy_io.String.Input("negative_prompt", multiline=True), + comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Generate a video with a specific Pikaffect. Supported Pikaffects: Cake-ify, Crumble, Crush, Decapitate, Deflate, Dissolve, Explode, Eye-pop, Inflate, Levitate, Melt, Peel, Poke, Squish, Ta-da, Tear" - - async def api_call( - self, + @classmethod + async def execute( + cls, image: torch.Tensor, pikaffect: str, prompt_text: str, negative_prompt: str, seed: int, - unique_id: str, - **kwargs, - ) -> tuple[VideoFromFile]: - + ) -> comfy_io.NodeOutput: + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_PIKAFFECTS, @@ -690,36 +659,38 @@ class PikaffectsNode(PikaNodeBase): ), files={"image": ("image.png", tensor_to_bytesio(image), "image/png")}, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) - - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaStartEndFrameNode2_2(PikaNodeBase): +class PikaStartEndFrameNode2_2(comfy_io.ComfyNode): """PikaFrames v2.2 Node.""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "image_start": (IO.IMAGE, {"tooltip": "The first image to combine."}), - "image_end": (IO.IMAGE, {"tooltip": "The last image to combine."}), - **cls.get_base_inputs_types( - PikaBodyGenerate22KeyframeGenerate22PikaframesPost - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="PikaStartEndFrameNode2_2", + display_name="Pika Start and End Frame to Video", + description="Generate a video by combining your first and last frame. Upload two images to define the start and end points, and let the AI create a smooth transition between them.", + category="api node/video/Pika", + inputs=[ + comfy_io.Image.Input("image_start", tooltip="The first image to combine."), + comfy_io.Image.Input("image_end", tooltip="The last image to combine."), + *get_base_inputs_types(), + ], + outputs=[comfy_io.Video.Output()], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Generate a video by combining your first and last frame. Upload two images to define the start and end points, and let the AI create a smooth transition between them." - - async def api_call( - self, + @classmethod + async def execute( + cls, image_start: torch.Tensor, image_end: torch.Tensor, prompt_text: str, @@ -727,15 +698,15 @@ class PikaStartEndFrameNode2_2(PikaNodeBase): seed: int, resolution: str, duration: int, - unique_id: str, - **kwargs, - ) -> tuple[VideoFromFile]: - + ) -> comfy_io.NodeOutput: pika_files = [ ("keyFrames", ("image_start.png", tensor_to_bytesio(image_start), "image/png")), ("keyFrames", ("image_end.png", tensor_to_bytesio(image_end), "image/png")), ] - + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_PIKAFRAMES, @@ -752,28 +723,24 @@ class PikaStartEndFrameNode2_2(PikaNodeBase): ), files=pika_files, content_type="multipart/form-data", - auth_kwargs=kwargs, + auth_kwargs=auth, ) - - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -NODE_CLASS_MAPPINGS = { - "PikaImageToVideoNode2_2": PikaImageToVideoV2_2, - "PikaTextToVideoNode2_2": PikaTextToVideoNodeV2_2, - "PikaScenesV2_2": PikaScenesV2_2, - "Pikadditions": PikAdditionsNode, - "Pikaswaps": PikaSwapsNode, - "Pikaffects": PikaffectsNode, - "PikaStartEndFrameNode2_2": PikaStartEndFrameNode2_2, -} +class PikaApiNodesExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + PikaImageToVideoV2_2, + PikaTextToVideoNodeV2_2, + PikaScenesV2_2, + PikAdditionsNode, + PikaSwapsNode, + PikaffectsNode, + PikaStartEndFrameNode2_2, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "PikaImageToVideoNode2_2": "Pika Image to Video", - "PikaTextToVideoNode2_2": "Pika Text to Video", - "PikaScenesV2_2": "Pika Scenes (Video Image Composition)", - "Pikadditions": "Pikadditions (Video Object Insertion)", - "Pikaswaps": "Pika Swaps (Video Object Replacement)", - "Pikaffects": "Pikaffects (Video Effects)", - "PikaStartEndFrameNode2_2": "Pika Start and End Frame to Video", -} + +async def comfy_entrypoint() -> PikaApiNodesExtension: + return PikaApiNodesExtension() From 8c1991042795d06c7ccfd5d1931eb994044c75ef Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 7 Oct 2025 02:26:52 +0300 Subject: [PATCH 410/759] convert nodes_kling.py to V3 schema (#10236) --- comfy_api_nodes/nodes_kling.py | 2146 +++++++++++++++----------------- 1 file changed, 1032 insertions(+), 1114 deletions(-) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 44fccc0c7..457b43451 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -10,6 +10,8 @@ from collections.abc import Callable import math import logging +from typing_extensions import override + import torch from comfy_api_nodes.apis import ( @@ -63,8 +65,8 @@ from comfy_api_nodes.apinode_utils import ( upload_video_to_comfyapi, upload_audio_to_comfyapi, download_url_to_image_tensor, + validate_string, ) -from comfy_api_nodes.mapper_utils import model_field_to_node_input from comfy_api_nodes.util.validation_utils import ( validate_image_dimensions, validate_image_aspect_ratio, @@ -73,8 +75,7 @@ from comfy_api_nodes.util.validation_utils import ( ) from comfy_api.input.basic_types import AudioInput from comfy_api.input.video_types import VideoInput -from comfy_api.input_impl import VideoFromFile -from comfy.comfy_types.node_typing import IO, InputTypeOptions, ComfyNodeABC +from comfy_api.latest import ComfyExtension, io as comfy_io KLING_API_VERSION = "v1" PATH_TEXT_TO_VIDEO = f"/proxy/kling/{KLING_API_VERSION}/videos/text2video" @@ -103,10 +104,113 @@ AVERAGE_DURATION_VIDEO_EXTEND = 320 R = TypeVar("R") -class KlingApiError(Exception): - """Base exception for Kling API errors.""" +MODE_TEXT2VIDEO = { + "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"), + "standard mode / 10s duration / kling-v1": ("std", "10", "kling-v1"), + "pro mode / 5s duration / kling-v1": ("pro", "5", "kling-v1"), + "pro mode / 10s duration / kling-v1": ("pro", "10", "kling-v1"), + "standard mode / 5s duration / kling-v1-6": ("std", "5", "kling-v1-6"), + "standard mode / 10s duration / kling-v1-6": ("std", "10", "kling-v1-6"), + "pro mode / 5s duration / kling-v2-master": ("pro", "5", "kling-v2-master"), + "pro mode / 10s duration / kling-v2-master": ("pro", "10", "kling-v2-master"), + "standard mode / 5s duration / kling-v2-master": ("std", "5", "kling-v2-master"), + "standard mode / 10s duration / kling-v2-master": ("std", "10", "kling-v2-master"), + "pro mode / 5s duration / kling-v2-1-master": ("pro", "5", "kling-v2-1-master"), + "pro mode / 10s duration / kling-v2-1-master": ("pro", "10", "kling-v2-1-master"), + "pro mode / 5s duration / kling-v2-5-turbo": ("pro", "5", "kling-v2-5-turbo"), + "pro mode / 10s duration / kling-v2-5-turbo": ("pro", "10", "kling-v2-5-turbo"), +} +""" +Mapping of mode strings to their corresponding (mode, duration, model_name) tuples. +Only includes config combos that support the `image_tail` request field. - pass +See: [Kling API Docs Capability Map](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap) +""" + + +MODE_START_END_FRAME = { + "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"), + "pro mode / 5s duration / kling-v1": ("pro", "5", "kling-v1"), + "pro mode / 5s duration / kling-v1-5": ("pro", "5", "kling-v1-5"), + "pro mode / 10s duration / kling-v1-5": ("pro", "10", "kling-v1-5"), + "pro mode / 5s duration / kling-v1-6": ("pro", "5", "kling-v1-6"), + "pro mode / 10s duration / kling-v1-6": ("pro", "10", "kling-v1-6"), + "pro mode / 5s duration / kling-v2-1": ("pro", "5", "kling-v2-1"), + "pro mode / 10s duration / kling-v2-1": ("pro", "10", "kling-v2-1"), +} +""" +Returns a mapping of mode strings to their corresponding (mode, duration, model_name) tuples. +Only includes config combos that support the `image_tail` request field. + +See: [Kling API Docs Capability Map](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap) +""" + + +VOICES_CONFIG = { + # English voices + "Melody": ("girlfriend_4_speech02", "en"), + "Sunny": ("genshin_vindi2", "en"), + "Sage": ("zhinen_xuesheng", "en"), + "Ace": ("AOT", "en"), + "Blossom": ("ai_shatang", "en"), + "Peppy": ("genshin_klee2", "en"), + "Dove": ("genshin_kirara", "en"), + "Shine": ("ai_kaiya", "en"), + "Anchor": ("oversea_male1", "en"), + "Lyric": ("ai_chenjiahao_712", "en"), + "Tender": ("chat1_female_new-3", "en"), + "Siren": ("chat_0407_5-1", "en"), + "Zippy": ("cartoon-boy-07", "en"), + "Bud": ("uk_boy1", "en"), + "Sprite": ("cartoon-girl-01", "en"), + "Candy": ("PeppaPig_platform", "en"), + "Beacon": ("ai_huangzhong_712", "en"), + "Rock": ("ai_huangyaoshi_712", "en"), + "Titan": ("ai_laoguowang_712", "en"), + "Grace": ("chengshu_jiejie", "en"), + "Helen": ("you_pingjing", "en"), + "Lore": ("calm_story1", "en"), + "Crag": ("uk_man2", "en"), + "Prattle": ("laopopo_speech02", "en"), + "Hearth": ("heainainai_speech02", "en"), + "The Reader": ("reader_en_m-v1", "en"), + "Commercial Lady": ("commercial_lady_en_f-v1", "en"), + # Chinese voices + "阳光少年": ("genshin_vindi2", "zh"), + "懂事小弟": ("zhinen_xuesheng", "zh"), + "运动少年": ("tiyuxi_xuedi", "zh"), + "青春少女": ("ai_shatang", "zh"), + "温柔小妹": ("genshin_klee2", "zh"), + "元气少女": ("genshin_kirara", "zh"), + "阳光男生": ("ai_kaiya", "zh"), + "幽默小哥": ("tiexin_nanyou", "zh"), + "文艺小哥": ("ai_chenjiahao_712", "zh"), + "甜美邻家": ("girlfriend_1_speech02", "zh"), + "温柔姐姐": ("chat1_female_new-3", "zh"), + "职场女青": ("girlfriend_2_speech02", "zh"), + "活泼男童": ("cartoon-boy-07", "zh"), + "俏皮女童": ("cartoon-girl-01", "zh"), + "稳重老爸": ("ai_huangyaoshi_712", "zh"), + "温柔妈妈": ("you_pingjing", "zh"), + "严肃上司": ("ai_laoguowang_712", "zh"), + "优雅贵妇": ("chengshu_jiejie", "zh"), + "慈祥爷爷": ("zhuxi_speech02", "zh"), + "唠叨爷爷": ("uk_oldman3", "zh"), + "唠叨奶奶": ("laopopo_speech02", "zh"), + "和蔼奶奶": ("heainainai_speech02", "zh"), + "东北老铁": ("dongbeilaotie_speech02", "zh"), + "重庆小伙": ("chongqingxiaohuo_speech02", "zh"), + "四川妹子": ("chuanmeizi_speech02", "zh"), + "潮汕大叔": ("chaoshandashu_speech02", "zh"), + "台湾男生": ("ai_taiwan_man2_speech02", "zh"), + "西安掌柜": ("xianzhanggui_speech02", "zh"), + "天津姐姐": ("tianjinjiejie_speech02", "zh"), + "新闻播报男": ("diyinnansang_DB_CN_M_04-v2", "zh"), + "译制片男": ("yizhipiannan-v1", "zh"), + "撒娇女友": ("tianmeixuemei-v1", "zh"), + "刀片烟嗓": ("daopianyansang-v1", "zh"), + "乖巧正太": ("mengwa-v1", "zh"), +} async def poll_until_finished( @@ -142,11 +246,6 @@ def is_valid_camera_control_configs(configs: list[float]) -> bool: return any(not math.isclose(value, 0.0) for value in configs) -def is_valid_prompt(prompt: str) -> bool: - """Verifies that the prompt is not empty.""" - return bool(prompt) - - def is_valid_task_creation_response(response: KlingText2VideoResponse) -> bool: """Verifies that the initial response contains a task ID.""" return bool(response.data.task_id) @@ -190,7 +289,7 @@ def validate_task_creation_response(response) -> None: if not is_valid_task_creation_response(response): error_msg = f"Kling initial request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}" logging.error(error_msg) - raise KlingApiError(error_msg) + raise Exception(error_msg) def validate_video_result_response(response) -> None: @@ -198,7 +297,7 @@ def validate_video_result_response(response) -> None: if not is_valid_video_response(response): error_msg = f"Kling task {response.data.task_id} succeeded but no video data found in response." logging.error(f"Error: {error_msg}.\nResponse: {response}") - raise KlingApiError(error_msg) + raise Exception(error_msg) def validate_image_result_response(response) -> None: @@ -206,7 +305,7 @@ def validate_image_result_response(response) -> None: if not is_valid_image_response(response): error_msg = f"Kling task {response.data.task_id} succeeded but no image data found in response." logging.error(f"Error: {error_msg}.\nResponse: {response}") - raise KlingApiError(error_msg) + raise Exception(error_msg) def validate_input_image(image: torch.Tensor) -> None: @@ -221,21 +320,6 @@ def validate_input_image(image: torch.Tensor) -> None: validate_image_aspect_ratio(image, min_aspect_ratio=1 / 2.5, max_aspect_ratio=2.5) -def get_camera_control_input_config( - tooltip: str, default: float = 0.0 -) -> tuple[IO, InputTypeOptions]: - """Returns common InputTypeOptions for Kling camera control configurations.""" - input_config = { - "default": default, - "min": -10.0, - "max": 10.0, - "step": 0.25, - "display": "slider", - "tooltip": tooltip, - } - return IO.FLOAT, input_config - - def get_video_from_response(response) -> KlingVideoResult: """Returns the first video object from the Kling video generation task result. Will raise an error if the response is not valid. @@ -278,17 +362,6 @@ def get_images_urls_from_response(response) -> Optional[str]: return None -async def video_result_to_node_output( - video: KlingVideoResult, -) -> tuple[VideoFromFile, str, str]: - """Converts a KlingVideoResult to a tuple of (VideoFromFile, str, str) to be used as a ComfyUI node output.""" - return ( - await download_url_to_video_output(str(video.url)), - str(video.id), - str(video.duration), - ) - - async def image_result_to_node_output( images: list[KlingImageResult], ) -> torch.Tensor: @@ -302,57 +375,339 @@ async def image_result_to_node_output( return torch.cat([await download_url_to_image_tensor(str(image.url)) for image in images]) -class KlingNodeBase(ComfyNodeABC): - """Base class for Kling nodes.""" +async def execute_text2video( + auth_kwargs: dict[str, str], + node_id: str, + prompt: str, + negative_prompt: str, + cfg_scale: float, + model_name: str, + model_mode: str, + duration: str, + aspect_ratio: str, + camera_control: Optional[KlingCameraControl] = None, +) -> comfy_io.NodeOutput: + validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V) + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path=PATH_TEXT_TO_VIDEO, + method=HttpMethod.POST, + request_model=KlingText2VideoRequest, + response_model=KlingText2VideoResponse, + ), + request=KlingText2VideoRequest( + prompt=prompt if prompt else None, + negative_prompt=negative_prompt if negative_prompt else None, + duration=KlingVideoGenDuration(duration), + mode=KlingVideoGenMode(model_mode), + model_name=KlingVideoGenModelName(model_name), + cfg_scale=cfg_scale, + aspect_ratio=KlingVideoGenAspectRatio(aspect_ratio), + camera_control=camera_control, + ), + auth_kwargs=auth_kwargs, + ) - FUNCTION = "api_call" - CATEGORY = "api node/video/Kling" - API_NODE = True + task_creation_response = await initial_operation.execute() + validate_task_creation_response(task_creation_response) + + task_id = task_creation_response.data.task_id + final_response = await poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=f"{PATH_TEXT_TO_VIDEO}/{task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=KlingText2VideoResponse, + ), + result_url_extractor=get_video_url_from_response, + estimated_duration=AVERAGE_DURATION_T2V, + node_id=node_id, + ) + validate_video_result_response(final_response) + + video = get_video_from_response(final_response) + return comfy_io.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) -class KlingCameraControls(KlingNodeBase): +async def execute_image2video( + auth_kwargs: dict[str, str], + node_id: str, + start_frame: torch.Tensor, + prompt: str, + negative_prompt: str, + model_name: str, + cfg_scale: float, + model_mode: str, + aspect_ratio: str, + duration: str, + camera_control: Optional[KlingCameraControl] = None, + end_frame: Optional[torch.Tensor] = None, +) -> comfy_io.NodeOutput: + validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_I2V) + validate_input_image(start_frame) + + if camera_control is not None: + # Camera control type for image 2 video is always `simple` + camera_control.type = KlingCameraControlType.simple + + if model_mode == "std" and model_name == KlingVideoGenModelName.kling_v2_5_turbo.value: + model_mode = "pro" # October 5: currently "std" mode is not supported for this model + + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path=PATH_IMAGE_TO_VIDEO, + method=HttpMethod.POST, + request_model=KlingImage2VideoRequest, + response_model=KlingImage2VideoResponse, + ), + request=KlingImage2VideoRequest( + model_name=KlingVideoGenModelName(model_name), + image=tensor_to_base64_string(start_frame), + image_tail=( + tensor_to_base64_string(end_frame) + if end_frame is not None + else None + ), + prompt=prompt, + negative_prompt=negative_prompt if negative_prompt else None, + cfg_scale=cfg_scale, + mode=KlingVideoGenMode(model_mode), + duration=KlingVideoGenDuration(duration), + camera_control=camera_control, + ), + auth_kwargs=auth_kwargs, + ) + + task_creation_response = await initial_operation.execute() + validate_task_creation_response(task_creation_response) + task_id = task_creation_response.data.task_id + + final_response = await poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}", + method=HttpMethod.GET, + request_model=KlingImage2VideoRequest, + response_model=KlingImage2VideoResponse, + ), + result_url_extractor=get_video_url_from_response, + estimated_duration=AVERAGE_DURATION_I2V, + node_id=node_id, + ) + validate_video_result_response(final_response) + + video = get_video_from_response(final_response) + return comfy_io.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) + + +async def execute_video_effect( + auth_kwargs: dict[str, str], + node_id: str, + dual_character: bool, + effect_scene: KlingDualCharacterEffectsScene | KlingSingleImageEffectsScene, + model_name: str, + duration: KlingVideoGenDuration, + image_1: torch.Tensor, + image_2: Optional[torch.Tensor] = None, + model_mode: Optional[KlingVideoGenMode] = None, +) -> comfy_io.NodeOutput: + if dual_character: + request_input_field = KlingDualCharacterEffectInput( + model_name=model_name, + mode=model_mode, + images=[ + tensor_to_base64_string(image_1), + tensor_to_base64_string(image_2), + ], + duration=duration, + ) + else: + request_input_field = KlingSingleImageEffectInput( + model_name=model_name, + image=tensor_to_base64_string(image_1), + duration=duration, + ) + + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path=PATH_VIDEO_EFFECTS, + method=HttpMethod.POST, + request_model=KlingVideoEffectsRequest, + response_model=KlingVideoEffectsResponse, + ), + request=KlingVideoEffectsRequest( + effect_scene=effect_scene, + input=request_input_field, + ), + auth_kwargs=auth_kwargs, + ) + + task_creation_response = await initial_operation.execute() + validate_task_creation_response(task_creation_response) + task_id = task_creation_response.data.task_id + + final_response = await poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=f"{PATH_VIDEO_EFFECTS}/{task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=KlingVideoEffectsResponse, + ), + result_url_extractor=get_video_url_from_response, + estimated_duration=AVERAGE_DURATION_VIDEO_EFFECTS, + node_id=node_id, + ) + validate_video_result_response(final_response) + + video = get_video_from_response(final_response) + return comfy_io.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) + + +async def execute_lipsync( + auth_kwargs: dict[str, str], + node_id: str, + video: VideoInput, + audio: Optional[AudioInput] = None, + voice_language: Optional[str] = None, + model_mode: Optional[str] = None, + text: Optional[str] = None, + voice_speed: Optional[float] = None, + voice_id: Optional[str] = None, +) -> comfy_io.NodeOutput: + if text: + validate_string(text, field_name="Text", max_length=MAX_PROMPT_LENGTH_LIP_SYNC) + validate_video_dimensions(video, 720, 1920) + validate_video_duration(video, 2, 10) + + # Upload video to Comfy API and get download URL + video_url = await upload_video_to_comfyapi(video, auth_kwargs=auth_kwargs) + logging.info("Uploaded video to Comfy API. URL: %s", video_url) + + # Upload the audio file to Comfy API and get download URL + if audio: + audio_url = await upload_audio_to_comfyapi(audio, auth_kwargs=auth_kwargs) + logging.info("Uploaded audio to Comfy API. URL: %s", audio_url) + else: + audio_url = None + + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path=PATH_LIP_SYNC, + method=HttpMethod.POST, + request_model=KlingLipSyncRequest, + response_model=KlingLipSyncResponse, + ), + request=KlingLipSyncRequest( + input=KlingLipSyncInputObject( + video_url=video_url, + mode=model_mode, + text=text, + voice_language=voice_language, + voice_speed=voice_speed, + audio_type="url", + audio_url=audio_url, + voice_id=voice_id, + ), + ), + auth_kwargs=auth_kwargs, + ) + + task_creation_response = await initial_operation.execute() + validate_task_creation_response(task_creation_response) + task_id = task_creation_response.data.task_id + + final_response = await poll_until_finished( + auth_kwargs, + ApiEndpoint( + path=f"{PATH_LIP_SYNC}/{task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=KlingLipSyncResponse, + ), + result_url_extractor=get_video_url_from_response, + estimated_duration=AVERAGE_DURATION_LIP_SYNC, + node_id=node_id, + ) + validate_video_result_response(final_response) + + video = get_video_from_response(final_response) + return comfy_io.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) + + +class KlingCameraControls(comfy_io.ComfyNode): """Kling Camera Controls Node""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "camera_control_type": model_field_to_node_input( - IO.COMBO, - KlingCameraControl, - "type", - enum_type=KlingCameraControlType, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="KlingCameraControls", + display_name="Kling Camera Controls", + category="api node/video/Kling", + description="Allows specifying configuration options for Kling Camera Controls and motion control effects.", + inputs=[ + comfy_io.Combo.Input("camera_control_type", options=[i.value for i in KlingCameraControlType]), + comfy_io.Float.Input( + "horizontal_movement", + default=0.0, + min=-10.0, + max=10.0, + step=0.25, + display_mode=comfy_io.NumberDisplay.slider, + tooltip="Controls camera's movement along horizontal axis (x-axis). Negative indicates left, positive indicates right", ), - "horizontal_movement": get_camera_control_input_config( - "Controls camera's movement along horizontal axis (x-axis). Negative indicates left, positive indicates right" + comfy_io.Float.Input( + "vertical_movement", + default=0.0, + min=-10.0, + max=10.0, + step=0.25, + display_mode=comfy_io.NumberDisplay.slider, + tooltip="Controls camera's movement along vertical axis (y-axis). Negative indicates downward, positive indicates upward.", ), - "vertical_movement": get_camera_control_input_config( - "Controls camera's movement along vertical axis (y-axis). Negative indicates downward, positive indicates upward." - ), - "pan": get_camera_control_input_config( - "Controls camera's rotation in vertical plane (x-axis). Negative indicates downward rotation, positive indicates upward rotation.", + comfy_io.Float.Input( + "pan", default=0.5, + min=-10.0, + max=10.0, + step=0.25, + display_mode=comfy_io.NumberDisplay.slider, + tooltip="Controls camera's rotation in vertical plane (x-axis). Negative indicates downward rotation, positive indicates upward rotation.", ), - "tilt": get_camera_control_input_config( - "Controls camera's rotation in horizontal plane (y-axis). Negative indicates left rotation, positive indicates right rotation.", + comfy_io.Float.Input( + "tilt", + default=0.0, + min=-10.0, + max=10.0, + step=0.25, + display_mode=comfy_io.NumberDisplay.slider, + tooltip="Controls camera's rotation in horizontal plane (y-axis). Negative indicates left rotation, positive indicates right rotation.", ), - "roll": get_camera_control_input_config( - "Controls camera's rolling amount (z-axis). Negative indicates counterclockwise, positive indicates clockwise.", + comfy_io.Float.Input( + "roll", + default=0.0, + min=-10.0, + max=10.0, + step=0.25, + display_mode=comfy_io.NumberDisplay.slider, + tooltip="Controls camera's rolling amount (z-axis). Negative indicates counterclockwise, positive indicates clockwise.", ), - "zoom": get_camera_control_input_config( - "Controls change in camera's focal length. Negative indicates narrower field of view, positive indicates wider field of view.", + comfy_io.Float.Input( + "zoom", + default=0.0, + min=-10.0, + max=10.0, + step=0.25, + display_mode=comfy_io.NumberDisplay.slider, + tooltip="Controls change in camera's focal length. Negative indicates narrower field of view, positive indicates wider field of view.", ), - } - } - - DESCRIPTION = "Allows specifying configuration options for Kling Camera Controls and motion control effects." - RETURN_TYPES = ("CAMERA_CONTROL",) - RETURN_NAMES = ("camera_control",) - FUNCTION = "main" - API_NODE = False # This is just a helper node, it doesn't make an API call + ], + outputs=[comfy_io.Custom("CAMERA_CONTROL").Output(display_name="camera_control")], + ) @classmethod - def VALIDATE_INPUTS( + def validate_inputs( cls, horizontal_movement: float, vertical_movement: float, @@ -374,8 +729,9 @@ class KlingCameraControls(KlingNodeBase): return "Invalid camera control configs: at least one of the values must be non-zero" return True - def main( - self, + @classmethod + def execute( + cls, camera_control_type: str, horizontal_movement: float, vertical_movement: float, @@ -383,8 +739,8 @@ class KlingCameraControls(KlingNodeBase): tilt: float, roll: float, zoom: float, - ) -> tuple[KlingCameraControl]: - return ( + ) -> comfy_io.NodeOutput: + return comfy_io.NodeOutput( KlingCameraControl( type=KlingCameraControlType(camera_control_type), config=KlingCameraConfig( @@ -395,303 +751,186 @@ class KlingCameraControls(KlingNodeBase): tilt=tilt, zoom=zoom, ), - ), + ) ) -class KlingTextToVideoNode(KlingNodeBase): +class KlingTextToVideoNode(comfy_io.ComfyNode): """Kling Text to Video Node""" - @staticmethod - def get_mode_string_mapping() -> dict[str, tuple[str, str, str]]: - """ - Returns a mapping of mode strings to their corresponding (mode, duration, model_name) tuples. - Only includes config combos that support the `image_tail` request field. - - See: [Kling API Docs Capability Map](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap) - """ - return { - "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"), - "standard mode / 10s duration / kling-v1": ("std", "10", "kling-v1"), - "pro mode / 5s duration / kling-v1": ("pro", "5", "kling-v1"), - "pro mode / 10s duration / kling-v1": ("pro", "10", "kling-v1"), - "standard mode / 5s duration / kling-v1-6": ("std", "5", "kling-v1-6"), - "standard mode / 10s duration / kling-v1-6": ("std", "10", "kling-v1-6"), - "pro mode / 5s duration / kling-v2-master": ("pro", "5", "kling-v2-master"), - "pro mode / 10s duration / kling-v2-master": ("pro", "10", "kling-v2-master"), - "standard mode / 5s duration / kling-v2-master": ("std", "5", "kling-v2-master"), - "standard mode / 10s duration / kling-v2-master": ("std", "10", "kling-v2-master"), - "pro mode / 5s duration / kling-v2-1-master": ("pro", "5", "kling-v2-1-master"), - "pro mode / 10s duration / kling-v2-1-master": ("pro", "10", "kling-v2-1-master"), - "pro mode / 5s duration / kling-v2-5-turbo": ("pro", "5", "kling-v2-5-turbo"), - "pro mode / 10s duration / kling-v2-5-turbo": ("pro", "10", "kling-v2-5-turbo"), - } - @classmethod - def INPUT_TYPES(s): - modes = list(KlingTextToVideoNode.get_mode_string_mapping().keys()) - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, KlingText2VideoRequest, "prompt", multiline=True - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, KlingText2VideoRequest, "negative_prompt", multiline=True - ), - "cfg_scale": model_field_to_node_input( - IO.FLOAT, - KlingText2VideoRequest, - "cfg_scale", - default=1.0, - min=0.0, - max=1.0, - ), - "aspect_ratio": model_field_to_node_input( - IO.COMBO, - KlingText2VideoRequest, + def define_schema(cls) -> comfy_io.Schema: + modes = list(MODE_TEXT2VIDEO.keys()) + return comfy_io.Schema( + node_id="KlingTextToVideoNode", + display_name="Kling Text to Video", + category="api node/video/Kling", + description="Kling Text to Video Node", + inputs=[ + comfy_io.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + comfy_io.Float.Input("cfg_scale", default=1.0, min=0.0, max=1.0), + comfy_io.Combo.Input( "aspect_ratio", - enum_type=KlingVideoGenAspectRatio, + options=[i.value for i in KlingVideoGenAspectRatio], + default="16:9", ), - "mode": ( - modes, - { - "default": modes[4], - "tooltip": "The configuration to use for the video generation following the format: mode / duration / model_name.", - }, + comfy_io.Combo.Input( + "mode", + options=modes, + default=modes[4], + tooltip="The configuration to use for the video generation following the format: mode / duration / model_name.", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = ("VIDEO", "STRING", "STRING") - RETURN_NAMES = ("VIDEO", "video_id", "duration") - DESCRIPTION = "Kling Text to Video Node" - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> KlingText2VideoResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_TEXT_TO_VIDEO}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingText2VideoResponse, - ), - result_url_extractor=get_video_url_from_response, - estimated_duration=AVERAGE_DURATION_T2V, - node_id=node_id, + ], + outputs=[ + comfy_io.Video.Output(), + comfy_io.String.Output(display_name="video_id"), + comfy_io.String.Output(display_name="duration"), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, negative_prompt: str, cfg_scale: float, mode: str, aspect_ratio: str, - camera_control: Optional[KlingCameraControl] = None, - model_name: Optional[str] = None, - duration: Optional[str] = None, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile, str, str]: - validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V) - if model_name is None: - mode, duration, model_name = self.get_mode_string_mapping()[mode] - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_TEXT_TO_VIDEO, - method=HttpMethod.POST, - request_model=KlingText2VideoRequest, - response_model=KlingText2VideoResponse, - ), - request=KlingText2VideoRequest( - prompt=prompt if prompt else None, - negative_prompt=negative_prompt if negative_prompt else None, - duration=KlingVideoGenDuration(duration), - mode=KlingVideoGenMode(mode), - model_name=KlingVideoGenModelName(model_name), - cfg_scale=cfg_scale, - aspect_ratio=KlingVideoGenAspectRatio(aspect_ratio), - camera_control=camera_control, - ), - auth_kwargs=kwargs, + ) -> comfy_io.NodeOutput: + model_mode, duration, model_name = MODE_TEXT2VIDEO[mode] + return await execute_text2video( + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + node_id=cls.hidden.unique_id, + prompt=prompt, + negative_prompt=negative_prompt, + cfg_scale=cfg_scale, + model_mode=model_mode, + aspect_ratio=aspect_ratio, + model_name=model_name, + duration=duration, ) - task_creation_response = await initial_operation.execute() - validate_task_creation_response(task_creation_response) - task_id = task_creation_response.data.task_id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id - ) - validate_video_result_response(final_response) - - video = get_video_from_response(final_response) - return await video_result_to_node_output(video) - - -class KlingCameraControlT2VNode(KlingTextToVideoNode): +class KlingCameraControlT2VNode(comfy_io.ComfyNode): """ Kling Text to Video Camera Control Node. This node is a text to video node, but it supports controlling the camera. Duration, mode, and model_name request fields are hard-coded because camera control is only supported in pro mode with the kling-v1-5 model at 5s duration as of 2025-05-02. """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, KlingText2VideoRequest, "prompt", multiline=True - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - KlingText2VideoRequest, - "negative_prompt", - multiline=True, - ), - "cfg_scale": model_field_to_node_input( - IO.FLOAT, - KlingText2VideoRequest, - "cfg_scale", - default=0.75, - min=0.0, - max=1.0, - ), - "aspect_ratio": model_field_to_node_input( - IO.COMBO, - KlingText2VideoRequest, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="KlingCameraControlT2VNode", + display_name="Kling Text to Video (Camera Control)", + category="api node/video/Kling", + description="Transform text into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original text.", + inputs=[ + comfy_io.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + comfy_io.Float.Input("cfg_scale", default=0.75, min=0.0, max=1.0), + comfy_io.Combo.Input( "aspect_ratio", - enum_type=KlingVideoGenAspectRatio, + options=[i.value for i in KlingVideoGenAspectRatio], + default="16:9", ), - "camera_control": ( - "CAMERA_CONTROL", - { - "tooltip": "Can be created using the Kling Camera Controls node. Controls the camera movement and motion during the video generation.", - }, + comfy_io.Custom("CAMERA_CONTROL").Input( + "camera_control", + tooltip="Can be created using the Kling Camera Controls node. Controls the camera movement and motion during the video generation.", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Video.Output(), + comfy_io.String.Output(display_name="video_id"), + comfy_io.String.Output(display_name="duration"), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Transform text into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original text." - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, negative_prompt: str, cfg_scale: float, aspect_ratio: str, camera_control: Optional[KlingCameraControl] = None, - unique_id: Optional[str] = None, - **kwargs, - ): - return await super().api_call( + ) -> comfy_io.NodeOutput: + return await execute_text2video( + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + node_id=cls.hidden.unique_id, model_name=KlingVideoGenModelName.kling_v1, cfg_scale=cfg_scale, - mode=KlingVideoGenMode.std, + model_mode=KlingVideoGenMode.std, aspect_ratio=KlingVideoGenAspectRatio(aspect_ratio), duration=KlingVideoGenDuration.field_5, prompt=prompt, negative_prompt=negative_prompt, camera_control=camera_control, - **kwargs, ) -class KlingImage2VideoNode(KlingNodeBase): +class KlingImage2VideoNode(comfy_io.ComfyNode): """Kling Image to Video Node""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "start_frame": model_field_to_node_input( - IO.IMAGE, - KlingImage2VideoRequest, - "image", - tooltip="The reference image used to generate the video.", - ), - "prompt": model_field_to_node_input( - IO.STRING, KlingImage2VideoRequest, "prompt", multiline=True - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - KlingImage2VideoRequest, - "negative_prompt", - multiline=True, - ), - "model_name": model_field_to_node_input( - IO.COMBO, - KlingImage2VideoRequest, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="KlingImage2VideoNode", + display_name="Kling Image to Video", + category="api node/video/Kling", + description="Kling Image to Video Node", + inputs=[ + comfy_io.Image.Input("start_frame", tooltip="The reference image used to generate the video."), + comfy_io.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + comfy_io.Combo.Input( "model_name", - enum_type=KlingVideoGenModelName, + options=[i.value for i in KlingVideoGenModelName], + default="kling-v2-master", ), - "cfg_scale": model_field_to_node_input( - IO.FLOAT, - KlingImage2VideoRequest, - "cfg_scale", - default=0.8, - min=0.0, - max=1.0, - ), - "mode": model_field_to_node_input( - IO.COMBO, - KlingImage2VideoRequest, - "mode", - enum_type=KlingVideoGenMode, - ), - "aspect_ratio": model_field_to_node_input( - IO.COMBO, - KlingImage2VideoRequest, + comfy_io.Float.Input("cfg_scale", default=0.8, min=0.0, max=1.0), + comfy_io.Combo.Input("mode", options=[i.value for i in KlingVideoGenMode], default="std"), + comfy_io.Combo.Input( "aspect_ratio", - enum_type=KlingVideoGenAspectRatio, + options=[i.value for i in KlingVideoGenAspectRatio], + default="16:9", ), - "duration": model_field_to_node_input( - IO.COMBO, - KlingImage2VideoRequest, - "duration", - enum_type=KlingVideoGenDuration, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = ("VIDEO", "STRING", "STRING") - RETURN_NAMES = ("VIDEO", "video_id", "duration") - DESCRIPTION = "Kling Image to Video Node" - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> KlingImage2VideoResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}", - method=HttpMethod.GET, - request_model=KlingImage2VideoRequest, - response_model=KlingImage2VideoResponse, - ), - result_url_extractor=get_video_url_from_response, - estimated_duration=AVERAGE_DURATION_I2V, - node_id=node_id, + comfy_io.Combo.Input("duration", options=[i.value for i in KlingVideoGenDuration], default="5"), + ], + outputs=[ + comfy_io.Video.Output(), + comfy_io.String.Output(display_name="video_id"), + comfy_io.String.Output(display_name="duration"), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) - async def api_call( - self, + @classmethod + async def execute( + cls, start_frame: torch.Tensor, prompt: str, negative_prompt: str, @@ -702,212 +941,151 @@ class KlingImage2VideoNode(KlingNodeBase): duration: str, camera_control: Optional[KlingCameraControl] = None, end_frame: Optional[torch.Tensor] = None, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile]: - validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_I2V) - validate_input_image(start_frame) - - if camera_control is not None: - # Camera control type for image 2 video is always `simple` - camera_control.type = KlingCameraControlType.simple - - if mode == "std" and model_name == KlingVideoGenModelName.kling_v2_5_turbo.value: - mode = "pro" # October 5: currently "std" mode is not supported for this model - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_IMAGE_TO_VIDEO, - method=HttpMethod.POST, - request_model=KlingImage2VideoRequest, - response_model=KlingImage2VideoResponse, - ), - request=KlingImage2VideoRequest( - model_name=KlingVideoGenModelName(model_name), - image=tensor_to_base64_string(start_frame), - image_tail=( - tensor_to_base64_string(end_frame) - if end_frame is not None - else None - ), - prompt=prompt, - negative_prompt=negative_prompt if negative_prompt else None, - cfg_scale=cfg_scale, - mode=KlingVideoGenMode(mode), - duration=KlingVideoGenDuration(duration), - camera_control=camera_control, - ), - auth_kwargs=kwargs, + ) -> comfy_io.NodeOutput: + return await execute_image2video( + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + node_id=cls.hidden.unique_id, + start_frame=start_frame, + prompt=prompt, + negative_prompt=negative_prompt, + cfg_scale=cfg_scale, + model_name=model_name, + aspect_ratio=aspect_ratio, + model_mode=mode, + duration=duration, + camera_control=camera_control, + end_frame=end_frame, ) - task_creation_response = await initial_operation.execute() - validate_task_creation_response(task_creation_response) - task_id = task_creation_response.data.task_id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id - ) - validate_video_result_response(final_response) - - video = get_video_from_response(final_response) - return await video_result_to_node_output(video) - - -class KlingCameraControlI2VNode(KlingImage2VideoNode): +class KlingCameraControlI2VNode(comfy_io.ComfyNode): """ Kling Image to Video Camera Control Node. This node is a image to video node, but it supports controlling the camera. Duration, mode, and model_name request fields are hard-coded because camera control is only supported in pro mode with the kling-v1-5 model at 5s duration as of 2025-05-02. """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "start_frame": model_field_to_node_input( - IO.IMAGE, KlingImage2VideoRequest, "image" + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="KlingCameraControlI2VNode", + display_name="Kling Image to Video (Camera Control)", + category="api node/video/Kling", + description="Transform still images into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original image.", + inputs=[ + comfy_io.Image.Input( + "start_frame", + tooltip="Reference Image - URL or Base64 encoded string, cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1. Base64 should not include data:image prefix.", ), - "prompt": model_field_to_node_input( - IO.STRING, KlingImage2VideoRequest, "prompt", multiline=True - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - KlingImage2VideoRequest, - "negative_prompt", - multiline=True, - ), - "cfg_scale": model_field_to_node_input( - IO.FLOAT, - KlingImage2VideoRequest, - "cfg_scale", - default=0.75, - min=0.0, - max=1.0, - ), - "aspect_ratio": model_field_to_node_input( - IO.COMBO, - KlingImage2VideoRequest, + comfy_io.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + comfy_io.Float.Input("cfg_scale", default=0.75, min=0.0, max=1.0), + comfy_io.Combo.Input( "aspect_ratio", - enum_type=KlingVideoGenAspectRatio, + options=[i.value for i in KlingVideoGenAspectRatio], + default="16:9", ), - "camera_control": ( - "CAMERA_CONTROL", - { - "tooltip": "Can be created using the Kling Camera Controls node. Controls the camera movement and motion during the video generation.", - }, + comfy_io.Custom("CAMERA_CONTROL").Input( + "camera_control", + tooltip="Can be created using the Kling Camera Controls node. Controls the camera movement and motion during the video generation.", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Video.Output(), + comfy_io.String.Output(display_name="video_id"), + comfy_io.String.Output(display_name="duration"), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Transform still images into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original image." - - async def api_call( - self, + @classmethod + async def execute( + cls, start_frame: torch.Tensor, prompt: str, negative_prompt: str, cfg_scale: float, aspect_ratio: str, camera_control: KlingCameraControl, - unique_id: Optional[str] = None, - **kwargs, - ): - return await super().api_call( + ) -> comfy_io.NodeOutput: + return await execute_image2video( + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + node_id=cls.hidden.unique_id, model_name=KlingVideoGenModelName.kling_v1_5, start_frame=start_frame, cfg_scale=cfg_scale, - mode=KlingVideoGenMode.pro, + model_mode=KlingVideoGenMode.pro, aspect_ratio=KlingVideoGenAspectRatio(aspect_ratio), duration=KlingVideoGenDuration.field_5, prompt=prompt, negative_prompt=negative_prompt, camera_control=camera_control, - unique_id=unique_id, - **kwargs, ) -class KlingStartEndFrameNode(KlingImage2VideoNode): +class KlingStartEndFrameNode(comfy_io.ComfyNode): """ Kling First Last Frame Node. This node allows creation of a video from a first and last frame. It calls the normal image to video endpoint, but only allows the subset of input options that support the `image_tail` request field. """ - @staticmethod - def get_mode_string_mapping() -> dict[str, tuple[str, str, str]]: - """ - Returns a mapping of mode strings to their corresponding (mode, duration, model_name) tuples. - Only includes config combos that support the `image_tail` request field. - - See: [Kling API Docs Capability Map](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap) - """ - return { - "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"), - "pro mode / 5s duration / kling-v1": ("pro", "5", "kling-v1"), - "pro mode / 5s duration / kling-v1-5": ("pro", "5", "kling-v1-5"), - "pro mode / 10s duration / kling-v1-5": ("pro", "10", "kling-v1-5"), - "pro mode / 5s duration / kling-v1-6": ("pro", "5", "kling-v1-6"), - "pro mode / 10s duration / kling-v1-6": ("pro", "10", "kling-v1-6"), - "pro mode / 5s duration / kling-v2-1": ("pro", "5", "kling-v2-1"), - "pro mode / 10s duration / kling-v2-1": ("pro", "10", "kling-v2-1"), - } + @classmethod + def define_schema(cls) -> comfy_io.Schema: + modes = list(MODE_START_END_FRAME.keys()) + return comfy_io.Schema( + node_id="KlingStartEndFrameNode", + display_name="Kling Start-End Frame to Video", + category="api node/video/Kling", + description="Generate a video sequence that transitions between your provided start and end images. The node creates all frames in between, producing a smooth transformation from the first frame to the last.", + inputs=[ + comfy_io.Image.Input( + "start_frame", + tooltip="Reference Image - URL or Base64 encoded string, cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1. Base64 should not include data:image prefix.", + ), + comfy_io.Image.Input( + "end_frame", + tooltip="Reference Image - End frame control. URL or Base64 encoded string, cannot exceed 10MB, resolution not less than 300*300px. Base64 should not include data:image prefix.", + ), + comfy_io.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + comfy_io.Float.Input("cfg_scale", default=0.5, min=0.0, max=1.0), + comfy_io.Combo.Input( + "aspect_ratio", + options=[i.value for i in KlingVideoGenAspectRatio], + default="16:9", + ), + comfy_io.Combo.Input( + "mode", + options=modes, + default=modes[2], + tooltip="The configuration to use for the video generation following the format: mode / duration / model_name.", + ), + ], + outputs=[ + comfy_io.Video.Output(), + comfy_io.String.Output(display_name="video_id"), + comfy_io.String.Output(display_name="duration"), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - modes = list(KlingStartEndFrameNode.get_mode_string_mapping().keys()) - return { - "required": { - "start_frame": model_field_to_node_input( - IO.IMAGE, KlingImage2VideoRequest, "image" - ), - "end_frame": model_field_to_node_input( - IO.IMAGE, KlingImage2VideoRequest, "image_tail" - ), - "prompt": model_field_to_node_input( - IO.STRING, KlingImage2VideoRequest, "prompt", multiline=True - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - KlingImage2VideoRequest, - "negative_prompt", - multiline=True, - ), - "cfg_scale": model_field_to_node_input( - IO.FLOAT, - KlingImage2VideoRequest, - "cfg_scale", - default=0.5, - min=0.0, - max=1.0, - ), - "aspect_ratio": model_field_to_node_input( - IO.COMBO, - KlingImage2VideoRequest, - "aspect_ratio", - enum_type=KlingVideoGenAspectRatio, - ), - "mode": ( - modes, - { - "default": modes[2], - "tooltip": "The configuration to use for the video generation following the format: mode / duration / model_name.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - DESCRIPTION = "Generate a video sequence that transitions between your provided start and end images. The node creates all frames in between, producing a smooth transformation from the first frame to the last." - - async def api_call( - self, + async def execute( + cls, start_frame: torch.Tensor, end_frame: torch.Tensor, prompt: str, @@ -915,90 +1093,78 @@ class KlingStartEndFrameNode(KlingImage2VideoNode): cfg_scale: float, aspect_ratio: str, mode: str, - unique_id: Optional[str] = None, - **kwargs, - ): - mode, duration, model_name = KlingStartEndFrameNode.get_mode_string_mapping()[ - mode - ] - return await super().api_call( + ) -> comfy_io.NodeOutput: + mode, duration, model_name = MODE_START_END_FRAME[mode] + return await execute_image2video( + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + node_id=cls.hidden.unique_id, prompt=prompt, negative_prompt=negative_prompt, model_name=model_name, start_frame=start_frame, cfg_scale=cfg_scale, - mode=mode, + model_mode=mode, aspect_ratio=aspect_ratio, duration=duration, end_frame=end_frame, - unique_id=unique_id, - **kwargs, ) -class KlingVideoExtendNode(KlingNodeBase): +class KlingVideoExtendNode(comfy_io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, KlingVideoExtendRequest, "prompt", multiline=True + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="KlingVideoExtendNode", + display_name="Kling Video Extend", + category="api node/video/Kling", + description="Kling Video Extend Node. Extend videos made by other Kling nodes. The video_id is created by using other Kling Nodes.", + inputs=[ + comfy_io.String.Input( + "prompt", + multiline=True, + tooltip="Positive text prompt for guiding the video extension", ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - KlingVideoExtendRequest, + comfy_io.String.Input( "negative_prompt", multiline=True, + tooltip="Negative text prompt for elements to avoid in the extended video", ), - "cfg_scale": model_field_to_node_input( - IO.FLOAT, - KlingVideoExtendRequest, - "cfg_scale", - default=0.5, - min=0.0, - max=1.0, + comfy_io.Float.Input("cfg_scale", default=0.5, min=0.0, max=1.0), + comfy_io.String.Input( + "video_id", + force_input=True, + tooltip="The ID of the video to be extended. Supports videos generated by text-to-video, image-to-video, and previous video extension operations. Cannot exceed 3 minutes total duration after extension.", ), - "video_id": model_field_to_node_input( - IO.STRING, KlingVideoExtendRequest, "video_id", forceInput=True - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = ("VIDEO", "STRING", "STRING") - RETURN_NAMES = ("VIDEO", "video_id", "duration") - DESCRIPTION = "Kling Video Extend Node. Extend videos made by other Kling nodes. The video_id is created by using other Kling Nodes." - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> KlingVideoExtendResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_VIDEO_EXTEND}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingVideoExtendResponse, - ), - result_url_extractor=get_video_url_from_response, - estimated_duration=AVERAGE_DURATION_VIDEO_EXTEND, - node_id=node_id, + ], + outputs=[ + comfy_io.Video.Output(), + comfy_io.String.Output(display_name="video_id"), + comfy_io.String.Output(display_name="duration"), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, negative_prompt: str, cfg_scale: float, video_id: str, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile, str, str]: + ) -> comfy_io.NodeOutput: validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_VIDEO_EXTEND, @@ -1012,560 +1178,323 @@ class KlingVideoExtendNode(KlingNodeBase): cfg_scale=cfg_scale, video_id=video_id, ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id - ) - validate_video_result_response(final_response) - - video = get_video_from_response(final_response) - return await video_result_to_node_output(video) - - -class KlingVideoEffectsBase(KlingNodeBase): - """Kling Video Effects Base""" - - RETURN_TYPES = ("VIDEO", "STRING", "STRING") - RETURN_NAMES = ("VIDEO", "video_id", "duration") - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> KlingVideoEffectsResponse: - return await poll_until_finished( - auth_kwargs, + final_response = await poll_until_finished( + auth, ApiEndpoint( - path=f"{PATH_VIDEO_EFFECTS}/{task_id}", + path=f"{PATH_VIDEO_EXTEND}/{task_id}", method=HttpMethod.GET, request_model=EmptyRequest, - response_model=KlingVideoEffectsResponse, + response_model=KlingVideoExtendResponse, ), result_url_extractor=get_video_url_from_response, - estimated_duration=AVERAGE_DURATION_VIDEO_EFFECTS, - node_id=node_id, - ) - - async def api_call( - self, - dual_character: bool, - effect_scene: KlingDualCharacterEffectsScene | KlingSingleImageEffectsScene, - model_name: str, - duration: KlingVideoGenDuration, - image_1: torch.Tensor, - image_2: Optional[torch.Tensor] = None, - mode: Optional[KlingVideoGenMode] = None, - unique_id: Optional[str] = None, - **kwargs, - ): - if dual_character: - request_input_field = KlingDualCharacterEffectInput( - model_name=model_name, - mode=mode, - images=[ - tensor_to_base64_string(image_1), - tensor_to_base64_string(image_2), - ], - duration=duration, - ) - else: - request_input_field = KlingSingleImageEffectInput( - model_name=model_name, - image=tensor_to_base64_string(image_1), - duration=duration, - ) - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_VIDEO_EFFECTS, - method=HttpMethod.POST, - request_model=KlingVideoEffectsRequest, - response_model=KlingVideoEffectsResponse, - ), - request=KlingVideoEffectsRequest( - effect_scene=effect_scene, - input=request_input_field, - ), - auth_kwargs=kwargs, - ) - - task_creation_response = await initial_operation.execute() - validate_task_creation_response(task_creation_response) - task_id = task_creation_response.data.task_id - - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id + estimated_duration=AVERAGE_DURATION_VIDEO_EXTEND, + node_id=cls.hidden.unique_id, ) validate_video_result_response(final_response) video = get_video_from_response(final_response) - return await video_result_to_node_output(video) + return comfy_io.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) -class KlingDualCharacterVideoEffectNode(KlingVideoEffectsBase): +class KlingDualCharacterVideoEffectNode(comfy_io.ComfyNode): """Kling Dual Character Video Effect Node""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image_left": (IO.IMAGE, {"tooltip": "Left side image"}), - "image_right": (IO.IMAGE, {"tooltip": "Right side image"}), - "effect_scene": model_field_to_node_input( - IO.COMBO, - KlingVideoEffectsRequest, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="KlingDualCharacterVideoEffectNode", + display_name="Kling Dual Character Video Effects", + category="api node/video/Kling", + description="Achieve different special effects when generating a video based on the effect_scene. First image will be positioned on left side, second on right side of the composite.", + inputs=[ + comfy_io.Image.Input("image_left", tooltip="Left side image"), + comfy_io.Image.Input("image_right", tooltip="Right side image"), + comfy_io.Combo.Input( "effect_scene", - enum_type=KlingDualCharacterEffectsScene, + options=[i.value for i in KlingDualCharacterEffectsScene], ), - "model_name": model_field_to_node_input( - IO.COMBO, - KlingDualCharacterEffectInput, + comfy_io.Combo.Input( "model_name", - enum_type=KlingCharacterEffectModelName, + options=[i.value for i in KlingCharacterEffectModelName], + default="kling-v1", ), - "mode": model_field_to_node_input( - IO.COMBO, - KlingDualCharacterEffectInput, + comfy_io.Combo.Input( "mode", - enum_type=KlingVideoGenMode, + options=[i.value for i in KlingVideoGenMode], + default="std", ), - "duration": model_field_to_node_input( - IO.COMBO, - KlingDualCharacterEffectInput, + comfy_io.Combo.Input( "duration", - enum_type=KlingVideoGenDuration, + options=[i.value for i in KlingVideoGenDuration], ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Video.Output(), + comfy_io.String.Output(display_name="duration"), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Achieve different special effects when generating a video based on the effect_scene. First image will be positioned on left side, second on right side of the composite." - RETURN_TYPES = ("VIDEO", "STRING") - RETURN_NAMES = ("VIDEO", "duration") - - async def api_call( - self, + @classmethod + async def execute( + cls, image_left: torch.Tensor, image_right: torch.Tensor, effect_scene: KlingDualCharacterEffectsScene, model_name: KlingCharacterEffectModelName, mode: KlingVideoGenMode, duration: KlingVideoGenDuration, - unique_id: Optional[str] = None, - **kwargs, - ): - video, _, duration = await super().api_call( + ) -> comfy_io.NodeOutput: + video, _, duration = await execute_video_effect( + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + node_id=cls.hidden.unique_id, dual_character=True, effect_scene=effect_scene, model_name=model_name, - mode=mode, + model_mode=mode, duration=duration, image_1=image_left, image_2=image_right, - unique_id=unique_id, - **kwargs, ) return video, duration -class KlingSingleImageVideoEffectNode(KlingVideoEffectsBase): +class KlingSingleImageVideoEffectNode(comfy_io.ComfyNode): """Kling Single Image Video Effect Node""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ( - IO.IMAGE, - { - "tooltip": " Reference Image. URL or Base64 encoded string (without data:image prefix). File size cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1" - }, - ), - "effect_scene": model_field_to_node_input( - IO.COMBO, - KlingVideoEffectsRequest, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="KlingSingleImageVideoEffectNode", + display_name="Kling Video Effects", + category="api node/video/Kling", + description="Achieve different special effects when generating a video based on the effect_scene.", + inputs=[ + comfy_io.Image.Input("image", tooltip=" Reference Image. URL or Base64 encoded string (without data:image prefix). File size cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1"), + comfy_io.Combo.Input( "effect_scene", - enum_type=KlingSingleImageEffectsScene, + options=[i.value for i in KlingSingleImageEffectsScene], ), - "model_name": model_field_to_node_input( - IO.COMBO, - KlingSingleImageEffectInput, + comfy_io.Combo.Input( "model_name", - enum_type=KlingSingleImageEffectModelName, + options=[i.value for i in KlingSingleImageEffectModelName], ), - "duration": model_field_to_node_input( - IO.COMBO, - KlingSingleImageEffectInput, + comfy_io.Combo.Input( "duration", - enum_type=KlingVideoGenDuration, + options=[i.value for i in KlingVideoGenDuration], ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Video.Output(), + comfy_io.String.Output(display_name="video_id"), + comfy_io.String.Output(display_name="duration"), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Achieve different special effects when generating a video based on the effect_scene." - - async def api_call( - self, + @classmethod + async def execute( + cls, image: torch.Tensor, effect_scene: KlingSingleImageEffectsScene, model_name: KlingSingleImageEffectModelName, duration: KlingVideoGenDuration, - unique_id: Optional[str] = None, - **kwargs, - ): - return await super().api_call( + ) -> comfy_io.NodeOutput: + return await execute_video_effect( + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + node_id=cls.hidden.unique_id, dual_character=False, effect_scene=effect_scene, model_name=model_name, duration=duration, image_1=image, - unique_id=unique_id, - **kwargs, ) -class KlingLipSyncBase(KlingNodeBase): - """Kling Lip Sync Base""" - - RETURN_TYPES = ("VIDEO", "STRING", "STRING") - RETURN_NAMES = ("VIDEO", "video_id", "duration") - - def validate_lip_sync_video(self, video: VideoInput): - """ - Validates the input video adheres to the expectations of the Kling Lip Sync API: - - Video length does not exceed 10s and is not shorter than 2s - - Length and width dimensions should both be between 720px and 1920px - - See: https://app.klingai.com/global/dev/document-api/apiReference/model/videoTolip - """ - validate_video_dimensions(video, 720, 1920) - validate_video_duration(video, 2, 10) - - def validate_text(self, text: str): - if not text: - raise ValueError("Text is required") - if len(text) > MAX_PROMPT_LENGTH_LIP_SYNC: - raise ValueError( - f"Text is too long. Maximum length is {MAX_PROMPT_LENGTH_LIP_SYNC} characters." - ) - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> KlingLipSyncResponse: - """Polls the Kling API endpoint until the task reaches a terminal state.""" - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_LIP_SYNC}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingLipSyncResponse, - ), - result_url_extractor=get_video_url_from_response, - estimated_duration=AVERAGE_DURATION_LIP_SYNC, - node_id=node_id, - ) - - async def api_call( - self, - video: VideoInput, - audio: Optional[AudioInput] = None, - voice_language: Optional[str] = None, - mode: Optional[str] = None, - text: Optional[str] = None, - voice_speed: Optional[float] = None, - voice_id: Optional[str] = None, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile, str, str]: - if text: - self.validate_text(text) - self.validate_lip_sync_video(video) - - # Upload video to Comfy API and get download URL - video_url = await upload_video_to_comfyapi(video, auth_kwargs=kwargs) - logging.info("Uploaded video to Comfy API. URL: %s", video_url) - - # Upload the audio file to Comfy API and get download URL - if audio: - audio_url = await upload_audio_to_comfyapi(audio, auth_kwargs=kwargs) - logging.info("Uploaded audio to Comfy API. URL: %s", audio_url) - else: - audio_url = None - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_LIP_SYNC, - method=HttpMethod.POST, - request_model=KlingLipSyncRequest, - response_model=KlingLipSyncResponse, - ), - request=KlingLipSyncRequest( - input=KlingLipSyncInputObject( - video_url=video_url, - mode=mode, - text=text, - voice_language=voice_language, - voice_speed=voice_speed, - audio_type="url", - audio_url=audio_url, - voice_id=voice_id, - ), - ), - auth_kwargs=kwargs, - ) - - task_creation_response = await initial_operation.execute() - validate_task_creation_response(task_creation_response) - task_id = task_creation_response.data.task_id - - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id - ) - validate_video_result_response(final_response) - - video = get_video_from_response(final_response) - return await video_result_to_node_output(video) - - -class KlingLipSyncAudioToVideoNode(KlingLipSyncBase): +class KlingLipSyncAudioToVideoNode(comfy_io.ComfyNode): """Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file.""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "video": (IO.VIDEO, {}), - "audio": (IO.AUDIO, {}), - "voice_language": model_field_to_node_input( - IO.COMBO, - KlingLipSyncInputObject, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="KlingLipSyncAudioToVideoNode", + display_name="Kling Lip Sync Video with Audio", + category="api node/video/Kling", + description="Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file. When using, ensure that the audio contains clearly distinguishable vocals and that the video contains a distinct face. The audio file should not be larger than 5MB. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.", + inputs=[ + comfy_io.Video.Input("video"), + comfy_io.Audio.Input("audio"), + comfy_io.Combo.Input( "voice_language", - enum_type=KlingLipSyncVoiceLanguage, + options=[i.value for i in KlingLipSyncVoiceLanguage], + default="en", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + comfy_io.Video.Output(), + comfy_io.String.Output(display_name="video_id"), + comfy_io.String.Output(display_name="duration"), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file. When using, ensure that the audio contains clearly distinguishable vocals and that the video contains a distinct face. The audio file should not be larger than 5MB. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length." - - async def api_call( - self, + @classmethod + async def execute( + cls, video: VideoInput, audio: AudioInput, voice_language: str, - unique_id: Optional[str] = None, - **kwargs, - ): - return await super().api_call( + ) -> comfy_io.NodeOutput: + return await execute_lipsync( + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + node_id=cls.hidden.unique_id, video=video, audio=audio, voice_language=voice_language, - mode="audio2video", - unique_id=unique_id, - **kwargs, + model_mode="audio2video", ) -class KlingLipSyncTextToVideoNode(KlingLipSyncBase): +class KlingLipSyncTextToVideoNode(comfy_io.ComfyNode): """Kling Lip Sync Text to Video Node. Syncs mouth movements in a video file to a text prompt.""" - @staticmethod - def get_voice_config() -> dict[str, tuple[str, str]]: - return { - # English voices - "Melody": ("girlfriend_4_speech02", "en"), - "Sunny": ("genshin_vindi2", "en"), - "Sage": ("zhinen_xuesheng", "en"), - "Ace": ("AOT", "en"), - "Blossom": ("ai_shatang", "en"), - "Peppy": ("genshin_klee2", "en"), - "Dove": ("genshin_kirara", "en"), - "Shine": ("ai_kaiya", "en"), - "Anchor": ("oversea_male1", "en"), - "Lyric": ("ai_chenjiahao_712", "en"), - "Tender": ("chat1_female_new-3", "en"), - "Siren": ("chat_0407_5-1", "en"), - "Zippy": ("cartoon-boy-07", "en"), - "Bud": ("uk_boy1", "en"), - "Sprite": ("cartoon-girl-01", "en"), - "Candy": ("PeppaPig_platform", "en"), - "Beacon": ("ai_huangzhong_712", "en"), - "Rock": ("ai_huangyaoshi_712", "en"), - "Titan": ("ai_laoguowang_712", "en"), - "Grace": ("chengshu_jiejie", "en"), - "Helen": ("you_pingjing", "en"), - "Lore": ("calm_story1", "en"), - "Crag": ("uk_man2", "en"), - "Prattle": ("laopopo_speech02", "en"), - "Hearth": ("heainainai_speech02", "en"), - "The Reader": ("reader_en_m-v1", "en"), - "Commercial Lady": ("commercial_lady_en_f-v1", "en"), - # Chinese voices - "阳光少年": ("genshin_vindi2", "zh"), - "懂事小弟": ("zhinen_xuesheng", "zh"), - "运动少年": ("tiyuxi_xuedi", "zh"), - "青春少女": ("ai_shatang", "zh"), - "温柔小妹": ("genshin_klee2", "zh"), - "元气少女": ("genshin_kirara", "zh"), - "阳光男生": ("ai_kaiya", "zh"), - "幽默小哥": ("tiexin_nanyou", "zh"), - "文艺小哥": ("ai_chenjiahao_712", "zh"), - "甜美邻家": ("girlfriend_1_speech02", "zh"), - "温柔姐姐": ("chat1_female_new-3", "zh"), - "职场女青": ("girlfriend_2_speech02", "zh"), - "活泼男童": ("cartoon-boy-07", "zh"), - "俏皮女童": ("cartoon-girl-01", "zh"), - "稳重老爸": ("ai_huangyaoshi_712", "zh"), - "温柔妈妈": ("you_pingjing", "zh"), - "严肃上司": ("ai_laoguowang_712", "zh"), - "优雅贵妇": ("chengshu_jiejie", "zh"), - "慈祥爷爷": ("zhuxi_speech02", "zh"), - "唠叨爷爷": ("uk_oldman3", "zh"), - "唠叨奶奶": ("laopopo_speech02", "zh"), - "和蔼奶奶": ("heainainai_speech02", "zh"), - "东北老铁": ("dongbeilaotie_speech02", "zh"), - "重庆小伙": ("chongqingxiaohuo_speech02", "zh"), - "四川妹子": ("chuanmeizi_speech02", "zh"), - "潮汕大叔": ("chaoshandashu_speech02", "zh"), - "台湾男生": ("ai_taiwan_man2_speech02", "zh"), - "西安掌柜": ("xianzhanggui_speech02", "zh"), - "天津姐姐": ("tianjinjiejie_speech02", "zh"), - "新闻播报男": ("diyinnansang_DB_CN_M_04-v2", "zh"), - "译制片男": ("yizhipiannan-v1", "zh"), - "撒娇女友": ("tianmeixuemei-v1", "zh"), - "刀片烟嗓": ("daopianyansang-v1", "zh"), - "乖巧正太": ("mengwa-v1", "zh"), - } + @classmethod + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="KlingLipSyncTextToVideoNode", + display_name="Kling Lip Sync Video with Text", + category="api node/video/Kling", + description="Kling Lip Sync Text to Video Node. Syncs mouth movements in a video file to a text prompt. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.", + inputs=[ + comfy_io.Video.Input("video"), + comfy_io.String.Input( + "text", + multiline=True, + tooltip="Text Content for Lip-Sync Video Generation. Required when mode is text2video. Maximum length is 120 characters.", + ), + comfy_io.Combo.Input( + "voice", + options=list(VOICES_CONFIG.keys()), + default="Melody", + ), + comfy_io.Float.Input( + "voice_speed", + default=1, + min=0.8, + max=2.0, + display_mode=comfy_io.NumberDisplay.slider, + tooltip="Speech Rate. Valid range: 0.8~2.0, accurate to one decimal place.", + ), + ], + outputs=[ + comfy_io.Video.Output(), + comfy_io.String.Output(display_name="video_id"), + comfy_io.String.Output(display_name="duration"), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - voice_options = list(s.get_voice_config().keys()) - return { - "required": { - "video": (IO.VIDEO, {}), - "text": model_field_to_node_input( - IO.STRING, KlingLipSyncInputObject, "text", multiline=True - ), - "voice": (voice_options, {"default": voice_options[0]}), - "voice_speed": model_field_to_node_input( - IO.FLOAT, KlingLipSyncInputObject, "voice_speed", slider=True - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - DESCRIPTION = "Kling Lip Sync Text to Video Node. Syncs mouth movements in a video file to a text prompt. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length." - - async def api_call( - self, + async def execute( + cls, video: VideoInput, text: str, voice: str, voice_speed: float, - unique_id: Optional[str] = None, - **kwargs, - ): - voice_id, voice_language = KlingLipSyncTextToVideoNode.get_voice_config()[voice] - return await super().api_call( + ) -> comfy_io.NodeOutput: + voice_id, voice_language = VOICES_CONFIG[voice] + return await execute_lipsync( + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + node_id=cls.hidden.unique_id, video=video, text=text, voice_language=voice_language, voice_id=voice_id, voice_speed=voice_speed, - mode="text2video", - unique_id=unique_id, - **kwargs, + model_mode="text2video", ) -class KlingImageGenerationBase(KlingNodeBase): - """Kling Image Generation Base Node.""" - - RETURN_TYPES = ("IMAGE",) - CATEGORY = "api node/image/Kling" - - def validate_prompt(self, prompt: str, negative_prompt: Optional[str] = None): - if not prompt or len(prompt) > MAX_PROMPT_LENGTH_IMAGE_GEN: - raise ValueError( - f"Prompt must be less than {MAX_PROMPT_LENGTH_IMAGE_GEN} characters" - ) - if negative_prompt and len(negative_prompt) > MAX_PROMPT_LENGTH_IMAGE_GEN: - raise ValueError( - f"Negative prompt must be less than {MAX_PROMPT_LENGTH_IMAGE_GEN} characters" - ) - - -class KlingVirtualTryOnNode(KlingImageGenerationBase): +class KlingVirtualTryOnNode(comfy_io.ComfyNode): """Kling Virtual Try On Node.""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "human_image": (IO.IMAGE, {}), - "cloth_image": (IO.IMAGE, {}), - "model_name": model_field_to_node_input( - IO.COMBO, - KlingVirtualTryOnRequest, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="KlingVirtualTryOnNode", + display_name="Kling Virtual Try On", + category="api node/image/Kling", + description="Kling Virtual Try On Node. Input a human image and a cloth image to try on the cloth on the human. You can merge multiple clothing item pictures into one image with a white background.", + inputs=[ + comfy_io.Image.Input("human_image"), + comfy_io.Image.Input("cloth_image"), + comfy_io.Combo.Input( "model_name", - enum_type=KlingVirtualTryOnModelName, + options=[i.value for i in KlingVirtualTryOnModelName], + default="kolors-virtual-try-on-v1", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - DESCRIPTION = "Kling Virtual Try On Node. Input a human image and a cloth image to try on the cloth on the human. You can merge multiple clothing item pictures into one image with a white background." - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> KlingVirtualTryOnResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_VIRTUAL_TRY_ON}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingVirtualTryOnResponse, - ), - result_url_extractor=get_images_urls_from_response, - estimated_duration=AVERAGE_DURATION_VIRTUAL_TRY_ON, - node_id=node_id, + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) - async def api_call( - self, + @classmethod + async def execute( + cls, human_image: torch.Tensor, cloth_image: torch.Tensor, model_name: KlingVirtualTryOnModelName, - unique_id: Optional[str] = None, - **kwargs, - ): + ) -> comfy_io.NodeOutput: + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_VIRTUAL_TRY_ON, @@ -1578,113 +1507,99 @@ class KlingVirtualTryOnNode(KlingImageGenerationBase): cloth_image=tensor_to_base64_string(cloth_image), model_name=model_name, ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id + final_response = await poll_until_finished( + auth, + ApiEndpoint( + path=f"{PATH_VIRTUAL_TRY_ON}/{task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=KlingVirtualTryOnResponse, + ), + result_url_extractor=get_images_urls_from_response, + estimated_duration=AVERAGE_DURATION_VIRTUAL_TRY_ON, + node_id=cls.hidden.unique_id, ) validate_image_result_response(final_response) images = get_images_from_response(final_response) - return (await image_result_to_node_output(images),) + return comfy_io.NodeOutput(await image_result_to_node_output(images)) -class KlingImageGenerationNode(KlingImageGenerationBase): +class KlingImageGenerationNode(comfy_io.ComfyNode): """Kling Image Generation Node. Generate an image from a text prompt with an optional reference image.""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, - KlingImageGenerationsRequest, - "prompt", - multiline=True, - max_length=MAX_PROMPT_LENGTH_IMAGE_GEN, + def define_schema(cls) -> comfy_io.Schema: + return comfy_io.Schema( + node_id="KlingImageGenerationNode", + display_name="Kling Image Generation", + category="api node/image/Kling", + description="Kling Image Generation Node. Generate an image from a text prompt with an optional reference image.", + inputs=[ + comfy_io.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + comfy_io.Combo.Input( + "image_type", + options=[i.value for i in KlingImageGenImageReferenceType], ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - KlingImageGenerationsRequest, - "negative_prompt", - multiline=True, - ), - "image_type": model_field_to_node_input( - IO.COMBO, - KlingImageGenerationsRequest, - "image_reference", - enum_type=KlingImageGenImageReferenceType, - ), - "image_fidelity": model_field_to_node_input( - IO.FLOAT, - KlingImageGenerationsRequest, + comfy_io.Float.Input( "image_fidelity", - slider=True, + default=0.5, + min=0.0, + max=1.0, step=0.01, + display_mode=comfy_io.NumberDisplay.slider, + tooltip="Reference intensity for user-uploaded images", ), - "human_fidelity": model_field_to_node_input( - IO.FLOAT, - KlingImageGenerationsRequest, + comfy_io.Float.Input( "human_fidelity", - slider=True, + default=0.45, + min=0.0, + max=1.0, step=0.01, + display_mode=comfy_io.NumberDisplay.slider, + tooltip="Subject reference similarity", ), - "model_name": model_field_to_node_input( - IO.COMBO, - KlingImageGenerationsRequest, + comfy_io.Combo.Input( "model_name", - enum_type=KlingImageGenModelName, + options=[i.value for i in KlingImageGenModelName], + default="kling-v1", ), - "aspect_ratio": model_field_to_node_input( - IO.COMBO, - KlingImageGenerationsRequest, + comfy_io.Combo.Input( "aspect_ratio", - enum_type=KlingImageGenAspectRatio, + options=[i.value for i in KlingImageGenAspectRatio], + default="16:9", ), - "n": model_field_to_node_input( - IO.INT, - KlingImageGenerationsRequest, + comfy_io.Int.Input( "n", + default=1, + min=1, + max=9, + tooltip="Number of generated images", ), - }, - "optional": { - "image": (IO.IMAGE, {}), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - DESCRIPTION = "Kling Image Generation Node. Generate an image from a text prompt with an optional reference image." - - async def get_response( - self, - task_id: str, - auth_kwargs: Optional[dict[str, str]], - node_id: Optional[str] = None, - ) -> KlingImageGenerationsResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_IMAGE_GENERATIONS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingImageGenerationsResponse, - ), - result_url_extractor=get_images_urls_from_response, - estimated_duration=AVERAGE_DURATION_IMAGE_GEN, - node_id=node_id, + comfy_io.Image.Input("image", optional=True), + ], + outputs=[ + comfy_io.Image.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, ) - async def api_call( - self, + @classmethod + async def execute( + cls, model_name: KlingImageGenModelName, prompt: str, negative_prompt: str, @@ -1694,10 +1609,9 @@ class KlingImageGenerationNode(KlingImageGenerationBase): n: int, aspect_ratio: KlingImageGenAspectRatio, image: Optional[torch.Tensor] = None, - unique_id: Optional[str] = None, - **kwargs, - ): - self.validate_prompt(prompt, negative_prompt) + ) -> comfy_io.NodeOutput: + validate_string(prompt, field_name="prompt", min_length=1, max_length=MAX_PROMPT_LENGTH_IMAGE_GEN) + validate_string(negative_prompt, field_name="negative_prompt", max_length=MAX_PROMPT_LENGTH_IMAGE_GEN) if image is None: image_type = None @@ -1706,6 +1620,10 @@ class KlingImageGenerationNode(KlingImageGenerationBase): else: image = tensor_to_base64_string(image) + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } initial_operation = SynchronousOperation( endpoint=ApiEndpoint( path=PATH_IMAGE_GENERATIONS, @@ -1724,50 +1642,50 @@ class KlingImageGenerationNode(KlingImageGenerationBase): n=n, aspect_ratio=aspect_ratio, ), - auth_kwargs=kwargs, + auth_kwargs=auth, ) task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id + final_response = await poll_until_finished( + auth, + ApiEndpoint( + path=f"{PATH_IMAGE_GENERATIONS}/{task_id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=KlingImageGenerationsResponse, + ), + result_url_extractor=get_images_urls_from_response, + estimated_duration=AVERAGE_DURATION_IMAGE_GEN, + node_id=cls.hidden.unique_id, ) validate_image_result_response(final_response) images = get_images_from_response(final_response) - return (await image_result_to_node_output(images),) + return comfy_io.NodeOutput(await image_result_to_node_output(images)) -NODE_CLASS_MAPPINGS = { - "KlingCameraControls": KlingCameraControls, - "KlingTextToVideoNode": KlingTextToVideoNode, - "KlingImage2VideoNode": KlingImage2VideoNode, - "KlingCameraControlI2VNode": KlingCameraControlI2VNode, - "KlingCameraControlT2VNode": KlingCameraControlT2VNode, - "KlingStartEndFrameNode": KlingStartEndFrameNode, - "KlingVideoExtendNode": KlingVideoExtendNode, - "KlingLipSyncAudioToVideoNode": KlingLipSyncAudioToVideoNode, - "KlingLipSyncTextToVideoNode": KlingLipSyncTextToVideoNode, - "KlingVirtualTryOnNode": KlingVirtualTryOnNode, - "KlingImageGenerationNode": KlingImageGenerationNode, - "KlingSingleImageVideoEffectNode": KlingSingleImageVideoEffectNode, - "KlingDualCharacterVideoEffectNode": KlingDualCharacterVideoEffectNode, -} +class KlingExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + KlingCameraControls, + KlingTextToVideoNode, + KlingImage2VideoNode, + KlingCameraControlI2VNode, + KlingCameraControlT2VNode, + KlingStartEndFrameNode, + KlingVideoExtendNode, + KlingLipSyncAudioToVideoNode, + KlingLipSyncTextToVideoNode, + KlingVirtualTryOnNode, + KlingImageGenerationNode, + KlingSingleImageVideoEffectNode, + KlingDualCharacterVideoEffectNode, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "KlingCameraControls": "Kling Camera Controls", - "KlingTextToVideoNode": "Kling Text to Video", - "KlingImage2VideoNode": "Kling Image to Video", - "KlingCameraControlI2VNode": "Kling Image to Video (Camera Control)", - "KlingCameraControlT2VNode": "Kling Text to Video (Camera Control)", - "KlingStartEndFrameNode": "Kling Start-End Frame to Video", - "KlingVideoExtendNode": "Kling Video Extend", - "KlingLipSyncAudioToVideoNode": "Kling Lip Sync Video with Audio", - "KlingLipSyncTextToVideoNode": "Kling Lip Sync Video with Text", - "KlingVirtualTryOnNode": "Kling Virtual Try On", - "KlingImageGenerationNode": "Kling Image Generation", - "KlingSingleImageVideoEffectNode": "Kling Video Effects", - "KlingDualCharacterVideoEffectNode": "Kling Dual Character Video Effects", -} + +async def comfy_entrypoint() -> KlingExtension: + return KlingExtension() From 8aea746212dc1bb1601b4dc5e8c8093d2221d89c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 6 Oct 2025 19:08:08 -0700 Subject: [PATCH 411/759] Implement gemma 3 as a text encoder. (#10241) Not useful yet. --- comfy/model_detection.py | 4 +- comfy/sd.py | 7 ++ comfy/text_encoders/llama.py | 133 +++++++++++++++++++++++++++------ comfy/text_encoders/lumina2.py | 26 ++++++- 4 files changed, 142 insertions(+), 28 deletions(-) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 46415c17a..7677617c0 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -365,8 +365,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["patch_size"] = 2 dit_config["in_channels"] = 16 dit_config["dim"] = 2304 - dit_config["cap_feat_dim"] = 2304 - dit_config["n_layers"] = 26 + dit_config["cap_feat_dim"] = state_dict['{}cap_embedder.1.weight'.format(key_prefix)].shape[1] + dit_config["n_layers"] = count_blocks(state_dict_keys, '{}layers.'.format(key_prefix) + '{}.') dit_config["n_heads"] = 24 dit_config["n_kv_heads"] = 8 dit_config["qk_norm"] = True diff --git a/comfy/sd.py b/comfy/sd.py index be225ad03..f2d95f85a 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -890,6 +890,7 @@ class TEModel(Enum): QWEN25_3B = 10 QWEN25_7B = 11 BYT5_SMALL_GLYPH = 12 + GEMMA_3_4B = 13 def detect_te_model(sd): if "text_model.encoder.layers.30.mlp.fc1.weight" in sd: @@ -912,6 +913,8 @@ def detect_te_model(sd): return TEModel.BYT5_SMALL_GLYPH return TEModel.T5_BASE if 'model.layers.0.post_feedforward_layernorm.weight' in sd: + if 'model.layers.0.self_attn.q_norm.weight' in sd: + return TEModel.GEMMA_3_4B return TEModel.GEMMA_2_2B if 'model.layers.0.self_attn.k_proj.bias' in sd: weight = sd['model.layers.0.self_attn.k_proj.bias'] @@ -1016,6 +1019,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None) + elif te_model == TEModel.GEMMA_3_4B: + clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data), model_type="gemma3_4b") + clip_target.tokenizer = comfy.text_encoders.lumina2.NTokenizer + tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None) elif te_model == TEModel.LLAMA3_8: clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**llama_detect(clip_data), clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None) diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index c5a48ba9f..c050759fe 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -3,6 +3,7 @@ import torch.nn as nn from dataclasses import dataclass from typing import Optional, Any import math +import logging from comfy.ldm.modules.attention import optimized_attention_for_device import comfy.model_management @@ -28,6 +29,9 @@ class Llama2Config: mlp_activation = "silu" qkv_bias = False rope_dims = None + q_norm = None + k_norm = None + rope_scale = None @dataclass class Qwen25_3BConfig: @@ -46,6 +50,9 @@ class Qwen25_3BConfig: mlp_activation = "silu" qkv_bias = True rope_dims = None + q_norm = None + k_norm = None + rope_scale = None @dataclass class Qwen25_7BVLI_Config: @@ -64,6 +71,9 @@ class Qwen25_7BVLI_Config: mlp_activation = "silu" qkv_bias = True rope_dims = [16, 24, 24] + q_norm = None + k_norm = None + rope_scale = None @dataclass class Gemma2_2B_Config: @@ -82,6 +92,32 @@ class Gemma2_2B_Config: mlp_activation = "gelu_pytorch_tanh" qkv_bias = False rope_dims = None + q_norm = None + k_norm = None + sliding_attention = None + rope_scale = None + +@dataclass +class Gemma3_4B_Config: + vocab_size: int = 262208 + hidden_size: int = 2560 + intermediate_size: int = 10240 + num_hidden_layers: int = 34 + num_attention_heads: int = 8 + num_key_value_heads: int = 4 + max_position_embeddings: int = 131072 + rms_norm_eps: float = 1e-6 + rope_theta = [10000.0, 1000000.0] + transformer_type: str = "gemma3" + head_dim = 256 + rms_norm_add = True + mlp_activation = "gelu_pytorch_tanh" + qkv_bias = False + rope_dims = None + q_norm = "gemma3" + k_norm = "gemma3" + sliding_attention = [False, False, False, False, False, 1024] + rope_scale = [1.0, 8.0] class RMSNorm(nn.Module): def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None): @@ -106,25 +142,40 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=None): - theta_numerator = torch.arange(0, head_dim, 2, device=device).float() - inv_freq = 1.0 / (theta ** (theta_numerator / head_dim)) +def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None): + if not isinstance(theta, list): + theta = [theta] - inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - if rope_dims is not None and position_ids.shape[0] > 1: - mrope_section = rope_dims * 2 - cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) - sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) - else: - cos = cos.unsqueeze(1) - sin = sin.unsqueeze(1) + out = [] + for index, t in enumerate(theta): + theta_numerator = torch.arange(0, head_dim, 2, device=device).float() + inv_freq = 1.0 / (t ** (theta_numerator / head_dim)) - return (cos, sin) + if rope_scale is not None: + if isinstance(rope_scale, list): + inv_freq /= rope_scale[index] + else: + inv_freq /= rope_scale + + inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + if rope_dims is not None and position_ids.shape[0] > 1: + mrope_section = rope_dims * 2 + cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) + sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) + else: + cos = cos.unsqueeze(1) + sin = sin.unsqueeze(1) + out.append((cos, sin)) + + if len(out) == 1: + return out[0] + + return out def apply_rope(xq, xk, freqs_cis): @@ -152,6 +203,14 @@ class Attention(nn.Module): self.v_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=config.qkv_bias, device=device, dtype=dtype) self.o_proj = ops.Linear(self.inner_size, config.hidden_size, bias=False, device=device, dtype=dtype) + self.q_norm = None + self.k_norm = None + + if config.q_norm == "gemma3": + self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) + if config.k_norm == "gemma3": + self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) + def forward( self, hidden_states: torch.Tensor, @@ -168,6 +227,11 @@ class Attention(nn.Module): xk = xk.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2) xv = xv.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2) + if self.q_norm is not None: + xq = self.q_norm(xq) + if self.k_norm is not None: + xk = self.k_norm(xk) + xq, xk = apply_rope(xq, xk, freqs_cis=freqs_cis) xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1) @@ -192,7 +256,7 @@ class MLP(nn.Module): return self.down_proj(self.activation(self.gate_proj(x)) * self.up_proj(x)) class TransformerBlock(nn.Module): - def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None): + def __init__(self, config: Llama2Config, index, device=None, dtype=None, ops: Any = None): super().__init__() self.self_attn = Attention(config, device=device, dtype=dtype, ops=ops) self.mlp = MLP(config, device=device, dtype=dtype, ops=ops) @@ -226,7 +290,7 @@ class TransformerBlock(nn.Module): return x class TransformerBlockGemma2(nn.Module): - def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None): + def __init__(self, config: Llama2Config, index, device=None, dtype=None, ops: Any = None): super().__init__() self.self_attn = Attention(config, device=device, dtype=dtype, ops=ops) self.mlp = MLP(config, device=device, dtype=dtype, ops=ops) @@ -235,6 +299,13 @@ class TransformerBlockGemma2(nn.Module): self.pre_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) self.post_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) + if config.sliding_attention is not None: # TODO: implement. (Not that necessary since models are trained on less than 1024 tokens) + self.sliding_attention = config.sliding_attention[index % len(config.sliding_attention)] + else: + self.sliding_attention = False + + self.transformer_type = config.transformer_type + def forward( self, x: torch.Tensor, @@ -242,6 +313,14 @@ class TransformerBlockGemma2(nn.Module): freqs_cis: Optional[torch.Tensor] = None, optimized_attention=None, ): + if self.transformer_type == 'gemma3': + if self.sliding_attention: + if x.shape[1] > self.sliding_attention: + logging.warning("Warning: sliding attention not implemented, results may be incorrect") + freqs_cis = freqs_cis[1] + else: + freqs_cis = freqs_cis[0] + # Self Attention residual = x x = self.input_layernorm(x) @@ -276,7 +355,7 @@ class Llama2_(nn.Module): device=device, dtype=dtype ) - if self.config.transformer_type == "gemma2": + if self.config.transformer_type == "gemma2" or self.config.transformer_type == "gemma3": transformer = TransformerBlockGemma2 self.normalize_in = True else: @@ -284,8 +363,8 @@ class Llama2_(nn.Module): self.normalize_in = False self.layers = nn.ModuleList([ - transformer(config, device=device, dtype=dtype, ops=ops) - for _ in range(config.num_hidden_layers) + transformer(config, index=i, device=device, dtype=dtype, ops=ops) + for i in range(config.num_hidden_layers) ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype) @@ -305,6 +384,7 @@ class Llama2_(nn.Module): freqs_cis = precompute_freqs_cis(self.config.head_dim, position_ids, self.config.rope_theta, + self.config.rope_scale, self.config.rope_dims, device=x.device) @@ -433,3 +513,12 @@ class Gemma2_2B(BaseLlama, torch.nn.Module): self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) self.dtype = dtype + +class Gemma3_4B(BaseLlama, torch.nn.Module): + def __init__(self, config_dict, dtype, device, operations): + super().__init__() + config = Gemma3_4B_Config(**config_dict) + self.num_layers = config.num_hidden_layers + + self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) + self.dtype = dtype diff --git a/comfy/text_encoders/lumina2.py b/comfy/text_encoders/lumina2.py index 674461b75..fd986e2c1 100644 --- a/comfy/text_encoders/lumina2.py +++ b/comfy/text_encoders/lumina2.py @@ -11,23 +11,41 @@ class Gemma2BTokenizer(sd1_clip.SDTokenizer): def state_dict(self): return {"spiece_model": self.tokenizer.serialize_model()} +class Gemma3_4BTokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer = tokenizer_data.get("spiece_model", None) + super().__init__(tokenizer, pad_with_end=False, embedding_size=2560, embedding_key='gemma3_4b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data) + + def state_dict(self): + return {"spiece_model": self.tokenizer.serialize_model()} class LuminaTokenizer(sd1_clip.SD1Tokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="gemma2_2b", tokenizer=Gemma2BTokenizer) +class NTokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="gemma3_4b", tokenizer=Gemma3_4BTokenizer) class Gemma2_2BModel(sd1_clip.SDClipModel): def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}): super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma2_2B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) +class Gemma3_4BModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}): + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_4B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) class LuminaModel(sd1_clip.SD1ClipModel): - def __init__(self, device="cpu", dtype=None, model_options={}): - super().__init__(device=device, dtype=dtype, name="gemma2_2b", clip_model=Gemma2_2BModel, model_options=model_options) + def __init__(self, device="cpu", dtype=None, model_options={}, name="gemma2_2b", clip_model=Gemma2_2BModel): + super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options) -def te(dtype_llama=None, llama_scaled_fp8=None): +def te(dtype_llama=None, llama_scaled_fp8=None, model_type="gemma2_2b"): + if model_type == "gemma2_2b": + model = Gemma2_2BModel + elif model_type == "gemma3_4b": + model = Gemma3_4BModel + class LuminaTEModel_(LuminaModel): def __init__(self, device="cpu", dtype=None, model_options={}): if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: @@ -35,5 +53,5 @@ def te(dtype_llama=None, llama_scaled_fp8=None): model_options["scaled_fp8"] = llama_scaled_fp8 if dtype_llama is not None: dtype = dtype_llama - super().__init__(device=device, dtype=dtype, model_options=model_options) + super().__init__(device=device, dtype=dtype, name=model_type, model_options=model_options, clip_model=model) return LuminaTEModel_ From fc34c3d1125e970699dcb311323839ed6dda4985 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 7 Oct 2025 23:15:32 +0300 Subject: [PATCH 412/759] fix(ReCraft-API-node): allow custom multipart parser to return FormData (#10244) --- comfy_api_nodes/apis/client.py | 17 ++++++++++------- comfy_api_nodes/nodes_recraft.py | 28 ++++++++++++++++++++++------ 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py index 79de3c262..36560c9e3 100644 --- a/comfy_api_nodes/apis/client.py +++ b/comfy_api_nodes/apis/client.py @@ -220,13 +220,16 @@ class ApiClient: if multipart_parser and data: data = multipart_parser(data) - form = aiohttp.FormData(default_to_multipart=True) - if data: # regular text fields - for k, v in data.items(): - if v is None: - continue # aiohttp fails to serialize "None" values - # aiohttp expects strings or bytes; convert enums etc. - form.add_field(k, str(v) if not isinstance(v, (bytes, bytearray)) else v) + if isinstance(data, aiohttp.FormData): + form = data # If the parser already returned a FormData, pass it through + else: + form = aiohttp.FormData(default_to_multipart=True) + if data: # regular text fields + for k, v in data.items(): + if v is None: + continue # aiohttp fails to serialize "None" values + # aiohttp expects strings or bytes; convert enums etc. + form.add_field(k, str(v) if not isinstance(v, (bytes, bytearray)) else v) if files: file_iter = files if isinstance(files, list) else files.items() diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index 0bbb551b8..8beed5675 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -35,6 +35,7 @@ from server import PromptServer import torch from io import BytesIO from PIL import UnidentifiedImageError +import aiohttp async def handle_recraft_file_request( @@ -82,10 +83,16 @@ async def handle_recraft_file_request( return all_bytesio -def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, converted_to_check: list[list]=None, is_list=False) -> dict: +def recraft_multipart_parser( + data, + parent_key=None, + formatter: callable = None, + converted_to_check: list[list] = None, + is_list: bool = False, + return_mode: str = "formdata" # "dict" | "formdata" +) -> dict | aiohttp.FormData: """ - Formats data such that multipart/form-data will work with requests library - when both files and data are present. + Formats data such that multipart/form-data will work with aiohttp library when both files and data are present. The OpenAI client that Recraft uses has a bizarre way of serializing lists: @@ -103,19 +110,19 @@ def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, co # Modification of a function that handled a different type of multipart parsing, big ups: # https://gist.github.com/kazqvaizer/4cebebe5db654a414132809f9f88067b - def handle_converted_lists(data, parent_key, lists_to_check=tuple[list]): + def handle_converted_lists(item, parent_key, lists_to_check=tuple[list]): # if list already exists exists, just extend list with data for check_list in lists_to_check: for conv_tuple in check_list: if conv_tuple[0] == parent_key and isinstance(conv_tuple[1], list): - conv_tuple[1].append(formatter(data)) + conv_tuple[1].append(formatter(item)) return True return False if converted_to_check is None: converted_to_check = [] - + effective_mode = return_mode if parent_key is None else "dict" if formatter is None: formatter = lambda v: v # Multipart representation of value @@ -145,6 +152,15 @@ def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, co else: converted.append((current_key, formatter(value))) + if effective_mode == "formdata": + fd = aiohttp.FormData() + for k, v in dict(converted).items(): + if isinstance(v, list): + for item in v: + fd.add_field(k, str(item)) + else: + fd.add_field(k, str(v)) + return fd return dict(converted) From 9e984c48bc6a1d1c82231c46542995dbf5a265d7 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 8 Oct 2025 00:11:37 +0300 Subject: [PATCH 413/759] feat(api-nodes): add Sora2 API node (#10249) --- comfy_api_nodes/apinode_utils.py | 23 +++- comfy_api_nodes/nodes_sora.py | 175 +++++++++++++++++++++++++++++++ nodes.py | 1 + 3 files changed, 194 insertions(+), 5 deletions(-) create mode 100644 comfy_api_nodes/nodes_sora.py diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py index 5ac3b92aa..571c74286 100644 --- a/comfy_api_nodes/apinode_utils.py +++ b/comfy_api_nodes/apinode_utils.py @@ -18,7 +18,7 @@ from comfy_api_nodes.apis.client import ( UploadResponse, ) from server import PromptServer - +from comfy.cli_args import args import numpy as np from PIL import Image @@ -30,7 +30,9 @@ from io import BytesIO import av -async def download_url_to_video_output(video_url: str, timeout: int = None) -> VideoFromFile: +async def download_url_to_video_output( + video_url: str, timeout: int = None, auth_kwargs: Optional[dict[str, str]] = None +) -> VideoFromFile: """Downloads a video from a URL and returns a `VIDEO` output. Args: @@ -39,7 +41,7 @@ async def download_url_to_video_output(video_url: str, timeout: int = None) -> V Returns: A Comfy node `VIDEO` output. """ - video_io = await download_url_to_bytesio(video_url, timeout) + video_io = await download_url_to_bytesio(video_url, timeout, auth_kwargs=auth_kwargs) if video_io is None: error_msg = f"Failed to download video from {video_url}" logging.error(error_msg) @@ -164,7 +166,9 @@ def mimetype_to_extension(mime_type: str) -> str: return mime_type.split("/")[-1].lower() -async def download_url_to_bytesio(url: str, timeout: int = None) -> BytesIO: +async def download_url_to_bytesio( + url: str, timeout: int = None, auth_kwargs: Optional[dict[str, str]] = None +) -> BytesIO: """Downloads content from a URL using requests and returns it as BytesIO. Args: @@ -174,9 +178,18 @@ async def download_url_to_bytesio(url: str, timeout: int = None) -> BytesIO: Returns: BytesIO object containing the downloaded content. """ + headers = {} + if url.startswith("/proxy/"): + url = str(args.comfy_api_base).rstrip("/") + url + auth_token = auth_kwargs.get("auth_token") + comfy_api_key = auth_kwargs.get("comfy_api_key") + if auth_token: + headers["Authorization"] = f"Bearer {auth_token}" + elif comfy_api_key: + headers["X-API-KEY"] = comfy_api_key timeout_cfg = aiohttp.ClientTimeout(total=timeout) if timeout else None async with aiohttp.ClientSession(timeout=timeout_cfg) as session: - async with session.get(url) as resp: + async with session.get(url, headers=headers) as resp: resp.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX) return BytesIO(await resp.read()) diff --git a/comfy_api_nodes/nodes_sora.py b/comfy_api_nodes/nodes_sora.py new file mode 100644 index 000000000..2d532d637 --- /dev/null +++ b/comfy_api_nodes/nodes_sora.py @@ -0,0 +1,175 @@ +from typing import Optional +from typing_extensions import override + +import torch +from pydantic import BaseModel, Field +from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api_nodes.apis.client import ( + ApiEndpoint, + HttpMethod, + SynchronousOperation, + PollingOperation, + EmptyRequest, +) +from comfy_api_nodes.util.validation_utils import get_number_of_images + +from comfy_api_nodes.apinode_utils import ( + download_url_to_video_output, + tensor_to_bytesio, +) + +class Sora2GenerationRequest(BaseModel): + prompt: str = Field(...) + model: str = Field(...) + seconds: str = Field(...) + size: str = Field(...) + + +class Sora2GenerationResponse(BaseModel): + id: str = Field(...) + error: Optional[dict] = Field(None) + status: Optional[str] = Field(None) + + +class OpenAIVideoSora2(comfy_io.ComfyNode): + @classmethod + def define_schema(cls): + return comfy_io.Schema( + node_id="OpenAIVideoSora2", + display_name="OpenAI Sora - Video", + category="api node/video/Sora", + description="OpenAI video and audio generation.", + inputs=[ + comfy_io.Combo.Input( + "model", + options=["sora-2", "sora-2-pro"], + default="sora-2", + ), + comfy_io.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Guiding text; may be empty if an input image is present.", + ), + comfy_io.Combo.Input( + "size", + options=[ + "720x1280", + "1280x720", + "1024x1792", + "1792x1024", + ], + default="1280x720", + ), + comfy_io.Combo.Input( + "duration", + options=[4, 8, 12], + default=8, + ), + comfy_io.Image.Input( + "image", + optional=True, + ), + comfy_io.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, + optional=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + ], + outputs=[ + comfy_io.Video.Output(), + ], + hidden=[ + comfy_io.Hidden.auth_token_comfy_org, + comfy_io.Hidden.api_key_comfy_org, + comfy_io.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + size: str = "1280x720", + duration: int = 8, + seed: int = 0, + image: Optional[torch.Tensor] = None, + ): + if model == "sora-2" and size not in ("720x1280", "1280x720"): + raise ValueError("Invalid size for sora-2 model, only 720x1280 and 1280x720 are supported.") + files_input = None + if image is not None: + if get_number_of_images(image) != 1: + raise ValueError("Currently only one input image is supported.") + files_input = {"input_reference": ("image.png", tensor_to_bytesio(image), "image/png")} + auth = { + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + } + payload = Sora2GenerationRequest( + model=model, + prompt=prompt, + seconds=str(duration), + size=size, + ) + initial_operation = SynchronousOperation( + endpoint=ApiEndpoint( + path="/proxy/openai/v1/videos", + method=HttpMethod.POST, + request_model=Sora2GenerationRequest, + response_model=Sora2GenerationResponse + ), + request=payload, + files=files_input, + auth_kwargs=auth, + content_type="multipart/form-data", + ) + initial_response = await initial_operation.execute() + if initial_response.error: + raise Exception(initial_response.error.message) + + model_time_multiplier = 1 if model == "sora-2" else 2 + poll_operation = PollingOperation( + poll_endpoint=ApiEndpoint( + path=f"/proxy/openai/v1/videos/{initial_response.id}", + method=HttpMethod.GET, + request_model=EmptyRequest, + response_model=Sora2GenerationResponse + ), + completed_statuses=["completed"], + failed_statuses=["failed"], + status_extractor=lambda x: x.status, + auth_kwargs=auth, + poll_interval=8.0, + max_poll_attempts=160, + node_id=cls.hidden.unique_id, + estimated_duration=45 * (duration / 4) * model_time_multiplier, + ) + await poll_operation.execute() + return comfy_io.NodeOutput( + await download_url_to_video_output( + f"/proxy/openai/v1/videos/{initial_response.id}/content", + auth_kwargs=auth, + ) + ) + + +class OpenAISoraExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + return [ + OpenAIVideoSora2, + ] + + +async def comfy_entrypoint() -> OpenAISoraExtension: + return OpenAISoraExtension() diff --git a/nodes.py b/nodes.py index 88d712993..2a2a5f2ad 100644 --- a/nodes.py +++ b/nodes.py @@ -2357,6 +2357,7 @@ async def init_builtin_api_nodes(): "nodes_stability.py", "nodes_pika.py", "nodes_runway.py", + "nodes_sora.py", "nodes_tripo.py", "nodes_moonvalley.py", "nodes_rodin.py", From 8a15568f10c0622a7281c32fadffc51511e53c10 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 7 Oct 2025 16:55:23 -0700 Subject: [PATCH 414/759] Temp fix for LTXV custom nodes. (#10251) --- comfy_extras/nodes_lt.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index b51d15804..50da5f4eb 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -34,6 +34,7 @@ class EmptyLTXVLatentVideo(io.ComfyNode): latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device()) return io.NodeOutput({"samples": latent}) + generate = execute # TODO: remove class LTXVImgToVideo(io.ComfyNode): @classmethod @@ -77,6 +78,8 @@ class LTXVImgToVideo(io.ComfyNode): return io.NodeOutput(positive, negative, {"samples": latent, "noise_mask": conditioning_latent_frames_mask}) + generate = execute # TODO: remove + def conditioning_get_any_value(conditioning, key, default=None): for t in conditioning: @@ -264,6 +267,8 @@ class LTXVAddGuide(io.ComfyNode): return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask}) + generate = execute # TODO: remove + class LTXVCropGuides(io.ComfyNode): @classmethod @@ -300,6 +305,8 @@ class LTXVCropGuides(io.ComfyNode): return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask}) + crop = execute # TODO: remove + class LTXVConditioning(io.ComfyNode): @classmethod @@ -498,6 +505,7 @@ class LTXVPreprocess(io.ComfyNode): output_images.append(preprocess(image[i], img_compression)) return io.NodeOutput(torch.stack(output_images)) + preprocess = execute # TODO: remove class LtxvExtension(ComfyExtension): @override From 19f595b788bd227004a5f7232f3b5895b46411ea Mon Sep 17 00:00:00 2001 From: filtered <176114999+webfiltered@users.noreply.github.com> Date: Wed, 8 Oct 2025 11:54:00 +1100 Subject: [PATCH 415/759] Bump frontend to 1.27.10 (#10252) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index db0486960..85b3bb63b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.27.7 +comfyui-frontend-package==1.27.10 comfyui-workflow-templates==0.1.93 comfyui-embedded-docs==0.2.6 torch From 51697d50dc94005b1c279eb0cf45207697946020 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 8 Oct 2025 10:48:51 +0800 Subject: [PATCH 416/759] update template to 0.1.94 (#10253) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 85b3bb63b..d4594df39 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.27.10 -comfyui-workflow-templates==0.1.93 +comfyui-workflow-templates==0.1.94 comfyui-embedded-docs==0.2.6 torch torchsde From 637221995f7424a561bd825de3e61ea117dfe1e3 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 8 Oct 2025 00:53:43 -0400 Subject: [PATCH 417/759] ComfyUI version 0.3.64 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index c3257d4bf..da5cde02d 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.63" +__version__ = "0.3.64" diff --git a/pyproject.toml b/pyproject.toml index abd1a5f5c..6ea839336 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.63" +version = "0.3.64" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 3e0eb8d33f9a65f2a01430f1b4a1535348af881c Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 8 Oct 2025 10:14:04 +0300 Subject: [PATCH 418/759] feat(V3-io): allow Enum classes for Combo options (#10237) --- comfy_api/latest/_io.py | 24 ++++++++++++++++----- comfy_api_nodes/nodes_bytedance.py | 16 +++++++------- comfy_api_nodes/nodes_kling.py | 20 +++++++++--------- comfy_api_nodes/nodes_luma.py | 18 ++++++++-------- comfy_api_nodes/nodes_pika.py | 6 +++--- comfy_api_nodes/nodes_pixverse.py | 22 +++++++++---------- comfy_api_nodes/nodes_runway.py | 12 +++++------ comfy_api_nodes/nodes_stability.py | 10 ++++----- comfy_api_nodes/nodes_vidu.py | 34 +++++++++++++++--------------- 9 files changed, 88 insertions(+), 74 deletions(-) diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 2d95cffd6..661309f19 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -336,11 +336,25 @@ class Combo(ComfyTypeIO): class Input(WidgetInput): """Combo input (dropdown).""" Type = str - def __init__(self, id: str, options: list[str]=None, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, - default: str=None, control_after_generate: bool=None, - upload: UploadType=None, image_folder: FolderType=None, - remote: RemoteOptions=None, - socketless: bool=None): + def __init__( + self, + id: str, + options: list[str] | list[int] | type[Enum] = None, + display_name: str=None, + optional=False, + tooltip: str=None, + lazy: bool=None, + default: str | int | Enum = None, + control_after_generate: bool=None, + upload: UploadType=None, + image_folder: FolderType=None, + remote: RemoteOptions=None, + socketless: bool=None, + ): + if isinstance(options, type) and issubclass(options, Enum): + options = [v.value for v in options] + if isinstance(default, Enum): + default = default.value super().__init__(id, display_name, optional, tooltip, lazy, default, socketless) self.multiselect = False self.options = options diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index 654d6a362..fcb01820c 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -249,8 +249,8 @@ class ByteDanceImageNode(comfy_io.ComfyNode): inputs=[ comfy_io.Combo.Input( "model", - options=[model.value for model in Text2ImageModelName], - default=Text2ImageModelName.seedream_3.value, + options=Text2ImageModelName, + default=Text2ImageModelName.seedream_3, tooltip="Model name", ), comfy_io.String.Input( @@ -382,8 +382,8 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode): inputs=[ comfy_io.Combo.Input( "model", - options=[model.value for model in Image2ImageModelName], - default=Image2ImageModelName.seededit_3.value, + options=Image2ImageModelName, + default=Image2ImageModelName.seededit_3, tooltip="Model name", ), comfy_io.Image.Input( @@ -676,8 +676,8 @@ class ByteDanceTextToVideoNode(comfy_io.ComfyNode): inputs=[ comfy_io.Combo.Input( "model", - options=[model.value for model in Text2VideoModelName], - default=Text2VideoModelName.seedance_1_pro.value, + options=Text2VideoModelName, + default=Text2VideoModelName.seedance_1_pro, tooltip="Model name", ), comfy_io.String.Input( @@ -793,8 +793,8 @@ class ByteDanceImageToVideoNode(comfy_io.ComfyNode): inputs=[ comfy_io.Combo.Input( "model", - options=[model.value for model in Image2VideoModelName], - default=Image2VideoModelName.seedance_1_pro.value, + options=Image2VideoModelName, + default=Image2VideoModelName.seedance_1_pro, tooltip="Model name", ), comfy_io.String.Input( diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 457b43451..fe5b8562d 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -647,7 +647,7 @@ class KlingCameraControls(comfy_io.ComfyNode): category="api node/video/Kling", description="Allows specifying configuration options for Kling Camera Controls and motion control effects.", inputs=[ - comfy_io.Combo.Input("camera_control_type", options=[i.value for i in KlingCameraControlType]), + comfy_io.Combo.Input("camera_control_type", options=KlingCameraControlType), comfy_io.Float.Input( "horizontal_movement", default=0.0, @@ -772,7 +772,7 @@ class KlingTextToVideoNode(comfy_io.ComfyNode): comfy_io.Float.Input("cfg_scale", default=1.0, min=0.0, max=1.0), comfy_io.Combo.Input( "aspect_ratio", - options=[i.value for i in KlingVideoGenAspectRatio], + options=KlingVideoGenAspectRatio, default="16:9", ), comfy_io.Combo.Input( @@ -840,7 +840,7 @@ class KlingCameraControlT2VNode(comfy_io.ComfyNode): comfy_io.Float.Input("cfg_scale", default=0.75, min=0.0, max=1.0), comfy_io.Combo.Input( "aspect_ratio", - options=[i.value for i in KlingVideoGenAspectRatio], + options=KlingVideoGenAspectRatio, default="16:9", ), comfy_io.Custom("CAMERA_CONTROL").Input( @@ -903,17 +903,17 @@ class KlingImage2VideoNode(comfy_io.ComfyNode): comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), comfy_io.Combo.Input( "model_name", - options=[i.value for i in KlingVideoGenModelName], + options=KlingVideoGenModelName, default="kling-v2-master", ), comfy_io.Float.Input("cfg_scale", default=0.8, min=0.0, max=1.0), - comfy_io.Combo.Input("mode", options=[i.value for i in KlingVideoGenMode], default="std"), + comfy_io.Combo.Input("mode", options=KlingVideoGenMode, default=KlingVideoGenMode.std), comfy_io.Combo.Input( "aspect_ratio", - options=[i.value for i in KlingVideoGenAspectRatio], - default="16:9", + options=KlingVideoGenAspectRatio, + default=KlingVideoGenAspectRatio.field_16_9, ), - comfy_io.Combo.Input("duration", options=[i.value for i in KlingVideoGenDuration], default="5"), + comfy_io.Combo.Input("duration", options=KlingVideoGenDuration, default=KlingVideoGenDuration.field_5), ], outputs=[ comfy_io.Video.Output(), @@ -984,8 +984,8 @@ class KlingCameraControlI2VNode(comfy_io.ComfyNode): comfy_io.Float.Input("cfg_scale", default=0.75, min=0.0, max=1.0), comfy_io.Combo.Input( "aspect_ratio", - options=[i.value for i in KlingVideoGenAspectRatio], - default="16:9", + options=KlingVideoGenAspectRatio, + default=KlingVideoGenAspectRatio.field_16_9, ), comfy_io.Custom("CAMERA_CONTROL").Input( "camera_control", diff --git a/comfy_api_nodes/nodes_luma.py b/comfy_api_nodes/nodes_luma.py index 9cd02ffd2..9cab2ca82 100644 --- a/comfy_api_nodes/nodes_luma.py +++ b/comfy_api_nodes/nodes_luma.py @@ -181,11 +181,11 @@ class LumaImageGenerationNode(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "model", - options=[model.value for model in LumaImageModel], + options=LumaImageModel, ), comfy_io.Combo.Input( "aspect_ratio", - options=[ratio.value for ratio in LumaAspectRatio], + options=LumaAspectRatio, default=LumaAspectRatio.ratio_16_9, ), comfy_io.Int.Input( @@ -366,7 +366,7 @@ class LumaImageModifyNode(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "model", - options=[model.value for model in LumaImageModel], + options=LumaImageModel, ), comfy_io.Int.Input( "seed", @@ -466,21 +466,21 @@ class LumaTextToVideoGenerationNode(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "model", - options=[model.value for model in LumaVideoModel], + options=LumaVideoModel, ), comfy_io.Combo.Input( "aspect_ratio", - options=[ratio.value for ratio in LumaAspectRatio], + options=LumaAspectRatio, default=LumaAspectRatio.ratio_16_9, ), comfy_io.Combo.Input( "resolution", - options=[resolution.value for resolution in LumaVideoOutputResolution], + options=LumaVideoOutputResolution, default=LumaVideoOutputResolution.res_540p, ), comfy_io.Combo.Input( "duration", - options=[dur.value for dur in LumaVideoModelOutputDuration], + options=LumaVideoModelOutputDuration, ), comfy_io.Boolean.Input( "loop", @@ -595,7 +595,7 @@ class LumaImageToVideoGenerationNode(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "model", - options=[model.value for model in LumaVideoModel], + options=LumaVideoModel, ), # comfy_io.Combo.Input( # "aspect_ratio", @@ -604,7 +604,7 @@ class LumaImageToVideoGenerationNode(comfy_io.ComfyNode): # ), comfy_io.Combo.Input( "resolution", - options=[resolution.value for resolution in LumaVideoOutputResolution], + options=LumaVideoOutputResolution, default=LumaVideoOutputResolution.res_540p, ), comfy_io.Combo.Input( diff --git a/comfy_api_nodes/nodes_pika.py b/comfy_api_nodes/nodes_pika.py index 0a9f04cc2..35d6baf1c 100644 --- a/comfy_api_nodes/nodes_pika.py +++ b/comfy_api_nodes/nodes_pika.py @@ -174,10 +174,10 @@ def get_base_inputs_types() -> list[comfy_io.Input]: comfy_io.String.Input("negative_prompt", multiline=True), comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), comfy_io.Combo.Input( - "resolution", options=[resolution.value for resolution in PikaResolutionEnum], default="1080p" + "resolution", options=PikaResolutionEnum, default=PikaResolutionEnum.field_1080p ), comfy_io.Combo.Input( - "duration", options=[duration.value for duration in PikaDurationEnum], default=5 + "duration", options=PikaDurationEnum, default=PikaDurationEnum.integer_5 ), ] @@ -616,7 +616,7 @@ class PikaffectsNode(comfy_io.ComfyNode): inputs=[ comfy_io.Image.Input("image", tooltip="The reference image to apply the Pikaffect to."), comfy_io.Combo.Input( - "pikaffect", options=[pikaffect.value for pikaffect in Pikaffect], default="Cake-ify" + "pikaffect", options=Pikaffect, default="Cake-ify" ), comfy_io.String.Input("prompt_text", multiline=True), comfy_io.String.Input("negative_prompt", multiline=True), diff --git a/comfy_api_nodes/nodes_pixverse.py b/comfy_api_nodes/nodes_pixverse.py index 2c91bbc65..a97610f06 100644 --- a/comfy_api_nodes/nodes_pixverse.py +++ b/comfy_api_nodes/nodes_pixverse.py @@ -85,7 +85,7 @@ class PixverseTemplateNode(comfy_io.ComfyNode): display_name="PixVerse Template", category="api node/video/PixVerse", inputs=[ - comfy_io.Combo.Input("template", options=[list(pixverse_templates.keys())]), + comfy_io.Combo.Input("template", options=list(pixverse_templates.keys())), ], outputs=[comfy_io.Custom(PixverseIO.TEMPLATE).Output(display_name="pixverse_template")], ) @@ -120,20 +120,20 @@ class PixverseTextToVideoNode(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "aspect_ratio", - options=[ratio.value for ratio in PixverseAspectRatio], + options=PixverseAspectRatio, ), comfy_io.Combo.Input( "quality", - options=[resolution.value for resolution in PixverseQuality], + options=PixverseQuality, default=PixverseQuality.res_540p, ), comfy_io.Combo.Input( "duration_seconds", - options=[dur.value for dur in PixverseDuration], + options=PixverseDuration, ), comfy_io.Combo.Input( "motion_mode", - options=[mode.value for mode in PixverseMotionMode], + options=PixverseMotionMode, ), comfy_io.Int.Input( "seed", @@ -262,16 +262,16 @@ class PixverseImageToVideoNode(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "quality", - options=[resolution.value for resolution in PixverseQuality], + options=PixverseQuality, default=PixverseQuality.res_540p, ), comfy_io.Combo.Input( "duration_seconds", - options=[dur.value for dur in PixverseDuration], + options=PixverseDuration, ), comfy_io.Combo.Input( "motion_mode", - options=[mode.value for mode in PixverseMotionMode], + options=PixverseMotionMode, ), comfy_io.Int.Input( "seed", @@ -403,16 +403,16 @@ class PixverseTransitionVideoNode(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "quality", - options=[resolution.value for resolution in PixverseQuality], + options=PixverseQuality, default=PixverseQuality.res_540p, ), comfy_io.Combo.Input( "duration_seconds", - options=[dur.value for dur in PixverseDuration], + options=PixverseDuration, ), comfy_io.Combo.Input( "motion_mode", - options=[mode.value for mode in PixverseMotionMode], + options=PixverseMotionMode, ), comfy_io.Int.Input( "seed", diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py index 27b2bf748..ea22692cb 100644 --- a/comfy_api_nodes/nodes_runway.py +++ b/comfy_api_nodes/nodes_runway.py @@ -200,11 +200,11 @@ class RunwayImageToVideoNodeGen3a(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "duration", - options=[model.value for model in Duration], + options=Duration, ), comfy_io.Combo.Input( "ratio", - options=[model.value for model in RunwayGen3aAspectRatio], + options=RunwayGen3aAspectRatio, ), comfy_io.Int.Input( "seed", @@ -300,11 +300,11 @@ class RunwayImageToVideoNodeGen4(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "duration", - options=[model.value for model in Duration], + options=Duration, ), comfy_io.Combo.Input( "ratio", - options=[model.value for model in RunwayGen4TurboAspectRatio], + options=RunwayGen4TurboAspectRatio, ), comfy_io.Int.Input( "seed", @@ -408,11 +408,11 @@ class RunwayFirstLastFrameNode(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "duration", - options=[model.value for model in Duration], + options=Duration, ), comfy_io.Combo.Input( "ratio", - options=[model.value for model in RunwayGen3aAspectRatio], + options=RunwayGen3aAspectRatio, ), comfy_io.Int.Input( "seed", diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index 5ba5ed986..bfb67fc9d 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -82,8 +82,8 @@ class StabilityStableImageUltraNode(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "aspect_ratio", - options=[x.value for x in StabilityAspectRatio], - default=StabilityAspectRatio.ratio_1_1.value, + options=StabilityAspectRatio, + default=StabilityAspectRatio.ratio_1_1, tooltip="Aspect ratio of generated image.", ), comfy_io.Combo.Input( @@ -217,12 +217,12 @@ class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "model", - options=[x.value for x in Stability_SD3_5_Model], + options=Stability_SD3_5_Model, ), comfy_io.Combo.Input( "aspect_ratio", - options=[x.value for x in StabilityAspectRatio], - default=StabilityAspectRatio.ratio_1_1.value, + options=StabilityAspectRatio, + default=StabilityAspectRatio.ratio_1_1, tooltip="Aspect ratio of generated image.", ), comfy_io.Combo.Input( diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py index 2f441948c..ac28b683c 100644 --- a/comfy_api_nodes/nodes_vidu.py +++ b/comfy_api_nodes/nodes_vidu.py @@ -173,8 +173,8 @@ class ViduTextToVideoNode(comfy_io.ComfyNode): inputs=[ comfy_io.Combo.Input( "model", - options=[model.value for model in VideoModelName], - default=VideoModelName.vidu_q1.value, + options=VideoModelName, + default=VideoModelName.vidu_q1, tooltip="Model name", ), comfy_io.String.Input( @@ -205,22 +205,22 @@ class ViduTextToVideoNode(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "aspect_ratio", - options=[model.value for model in AspectRatio], - default=AspectRatio.r_16_9.value, + options=AspectRatio, + default=AspectRatio.r_16_9, tooltip="The aspect ratio of the output video", optional=True, ), comfy_io.Combo.Input( "resolution", - options=[model.value for model in Resolution], - default=Resolution.r_1080p.value, + options=Resolution, + default=Resolution.r_1080p, tooltip="Supported values may vary by model & duration", optional=True, ), comfy_io.Combo.Input( "movement_amplitude", - options=[model.value for model in MovementAmplitude], - default=MovementAmplitude.auto.value, + options=MovementAmplitude, + default=MovementAmplitude.auto, tooltip="The movement amplitude of objects in the frame", optional=True, ), @@ -278,8 +278,8 @@ class ViduImageToVideoNode(comfy_io.ComfyNode): inputs=[ comfy_io.Combo.Input( "model", - options=[model.value for model in VideoModelName], - default=VideoModelName.vidu_q1.value, + options=VideoModelName, + default=VideoModelName.vidu_q1, tooltip="Model name", ), comfy_io.Image.Input( @@ -316,14 +316,14 @@ class ViduImageToVideoNode(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "resolution", - options=[model.value for model in Resolution], - default=Resolution.r_1080p.value, + options=Resolution, + default=Resolution.r_1080p, tooltip="Supported values may vary by model & duration", optional=True, ), comfy_io.Combo.Input( "movement_amplitude", - options=[model.value for model in MovementAmplitude], + options=MovementAmplitude, default=MovementAmplitude.auto.value, tooltip="The movement amplitude of objects in the frame", optional=True, @@ -388,8 +388,8 @@ class ViduReferenceVideoNode(comfy_io.ComfyNode): inputs=[ comfy_io.Combo.Input( "model", - options=[model.value for model in VideoModelName], - default=VideoModelName.vidu_q1.value, + options=VideoModelName, + default=VideoModelName.vidu_q1, tooltip="Model name", ), comfy_io.Image.Input( @@ -424,8 +424,8 @@ class ViduReferenceVideoNode(comfy_io.ComfyNode): ), comfy_io.Combo.Input( "aspect_ratio", - options=[model.value for model in AspectRatio], - default=AspectRatio.r_16_9.value, + options=AspectRatio, + default=AspectRatio.r_16_9, tooltip="The aspect ratio of the output video", optional=True, ), From 6e59934089df3375e39db174340b6a937b226c83 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:49:02 -0700 Subject: [PATCH 419/759] Refactor model sampling sigmas code. (#10250) --- comfy/model_sampling.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/comfy/model_sampling.py b/comfy/model_sampling.py index b240b7f29..2a00ed819 100644 --- a/comfy/model_sampling.py +++ b/comfy/model_sampling.py @@ -21,17 +21,23 @@ def rescale_zero_terminal_snr_sigmas(sigmas): alphas_bar[-1] = 4.8973451890853435e-08 return ((1 - alphas_bar) / alphas_bar) ** 0.5 +def reshape_sigma(sigma, noise_dim): + if sigma.nelement() == 1: + return sigma.view(()) + else: + return sigma.view(sigma.shape[:1] + (1,) * (noise_dim - 1)) + class EPS: def calculate_input(self, sigma, noise): - sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1)) + sigma = reshape_sigma(sigma, noise.ndim) return noise / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 def calculate_denoised(self, sigma, model_output, model_input): - sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) + sigma = reshape_sigma(sigma, model_output.ndim) return model_input - model_output * sigma def noise_scaling(self, sigma, noise, latent_image, max_denoise=False): - sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1)) + sigma = reshape_sigma(sigma, noise.ndim) if max_denoise: noise = noise * torch.sqrt(1.0 + sigma ** 2.0) else: @@ -45,12 +51,12 @@ class EPS: class V_PREDICTION(EPS): def calculate_denoised(self, sigma, model_output, model_input): - sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) + sigma = reshape_sigma(sigma, model_output.ndim) return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) - model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 class EDM(V_PREDICTION): def calculate_denoised(self, sigma, model_output, model_input): - sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) + sigma = reshape_sigma(sigma, model_output.ndim) return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) + model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 class CONST: @@ -58,15 +64,15 @@ class CONST: return noise def calculate_denoised(self, sigma, model_output, model_input): - sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) + sigma = reshape_sigma(sigma, model_output.ndim) return model_input - model_output * sigma def noise_scaling(self, sigma, noise, latent_image, max_denoise=False): - sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1)) + sigma = reshape_sigma(sigma, noise.ndim) return sigma * noise + (1.0 - sigma) * latent_image def inverse_noise_scaling(self, sigma, latent): - sigma = sigma.view(sigma.shape[:1] + (1,) * (latent.ndim - 1)) + sigma = reshape_sigma(sigma, latent.ndim) return latent / (1.0 - sigma) class X0(EPS): @@ -80,16 +86,16 @@ class IMG_TO_IMG(X0): class COSMOS_RFLOW: def calculate_input(self, sigma, noise): sigma = (sigma / (sigma + 1)) - sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1)) + sigma = reshape_sigma(sigma, noise.ndim) return noise * (1.0 - sigma) def calculate_denoised(self, sigma, model_output, model_input): sigma = (sigma / (sigma + 1)) - sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) + sigma = reshape_sigma(sigma, model_output.ndim) return model_input * (1.0 - sigma) - model_output * sigma def noise_scaling(self, sigma, noise, latent_image, max_denoise=False): - sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1)) + sigma = reshape_sigma(sigma, noise.ndim) noise = noise * sigma noise += latent_image return noise From 72c2071972d3207ed92bc20535299c5f39622818 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Wed, 8 Oct 2025 17:30:41 -0700 Subject: [PATCH 420/759] Mvly/node update (#10042) * updated V2V node to allow for control image input exposing steps in v2v fixing guidance_scale as input parameter TODO: allow for motion_intensity as input param. * refactor: comment out unsupported resolution and adjust default values in video nodes * set control_after_generate * adding new defaults * fixes * changed control_after_generate back to True * changed control_after_generate back to False --------- Co-authored-by: thorsten --- comfy_api_nodes/nodes_moonvalley.py | 60 ++++++++++++++++++----------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 55471a69d..4a56d31f8 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -335,7 +335,7 @@ def parse_width_height_from_res(resolution: str): "1:1 (1152 x 1152)": {"width": 1152, "height": 1152}, "4:3 (1536 x 1152)": {"width": 1536, "height": 1152}, "3:4 (1152 x 1536)": {"width": 1152, "height": 1536}, - "21:9 (2560 x 1080)": {"width": 2560, "height": 1080}, + # "21:9 (2560 x 1080)": {"width": 2560, "height": 1080}, } return res_map.get(resolution, {"width": 1920, "height": 1080}) @@ -388,11 +388,11 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): "negative_prompt", multiline=True, default=" gopro, bright, contrast, static, overexposed, vignette, " - "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " - "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " - "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " - "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " - "wobbly, weird, low quality, plastic, stock footage, video camera, boring", + "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " + "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " + "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " + "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " + "wobbly, weird, low quality, plastic, stock footage, video camera, boring", tooltip="Negative prompt text", ), comfy_io.Combo.Input( @@ -403,14 +403,14 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): "1:1 (1152 x 1152)", "4:3 (1536 x 1152)", "3:4 (1152 x 1536)", - "21:9 (2560 x 1080)", + # "21:9 (2560 x 1080)", ], default="16:9 (1920 x 1080)", tooltip="Resolution of the output video", ), comfy_io.Float.Input( "prompt_adherence", - default=10.0, + default=4.5, min=1.0, max=20.0, step=1.0, @@ -424,10 +424,11 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): step=1, display_mode=comfy_io.NumberDisplay.number, tooltip="Random seed value", + control_after_generate=True, ), comfy_io.Int.Input( "steps", - default=100, + default=33, min=1, max=100, step=1, @@ -468,7 +469,6 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): steps=steps, seed=seed, guidance_scale=prompt_adherence, - num_frames=128, width=width_height["width"], height=width_height["height"], use_negative_prompts=True, @@ -526,11 +526,11 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): "negative_prompt", multiline=True, default=" gopro, bright, contrast, static, overexposed, vignette, " - "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " - "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " - "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " - "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " - "wobbly, weird, low quality, plastic, stock footage, video camera, boring", + "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " + "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " + "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " + "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " + "wobbly, weird, low quality, plastic, stock footage, video camera, boring", tooltip="Negative prompt text", ), comfy_io.Int.Input( @@ -546,7 +546,7 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): comfy_io.Video.Input( "video", tooltip="The reference video used to generate the output video. Must be at least 5 seconds long. " - "Videos longer than 5s will be automatically trimmed. Only MP4 format supported.", + "Videos longer than 5s will be automatically trimmed. Only MP4 format supported.", ), comfy_io.Combo.Input( "control_type", @@ -563,6 +563,15 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): tooltip="Only used if control_type is 'Motion Transfer'", optional=True, ), + comfy_io.Int.Input( + "steps", + default=33, + min=1, + max=100, + step=1, + display_mode=comfy_io.NumberDisplay.number, + tooltip="Number of inference steps", + ), ], outputs=[comfy_io.Video.Output()], hidden=[ @@ -582,6 +591,8 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): video: Optional[VideoInput] = None, control_type: str = "Motion Transfer", motion_intensity: Optional[int] = 100, + steps=33, + prompt_adherence=4.5, ) -> comfy_io.NodeOutput: auth = { "auth_token": cls.hidden.auth_token_comfy_org, @@ -602,6 +613,8 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): negative_prompt=negative_prompt, seed=seed, control_params=control_params, + steps=steps, + guidance_scale=prompt_adherence, ) control = parse_control_parameter(control_type) @@ -653,11 +666,11 @@ class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): "negative_prompt", multiline=True, default=" gopro, bright, contrast, static, overexposed, vignette, " - "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " - "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " - "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " - "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " - "wobbly, weird, low quality, plastic, stock footage, video camera, boring", + "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " + "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " + "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " + "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " + "wobbly, weird, low quality, plastic, stock footage, video camera, boring", tooltip="Negative prompt text", ), comfy_io.Combo.Input( @@ -675,7 +688,7 @@ class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): ), comfy_io.Float.Input( "prompt_adherence", - default=10.0, + default=4.0, min=1.0, max=20.0, step=1.0, @@ -688,11 +701,12 @@ class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): max=4294967295, step=1, display_mode=comfy_io.NumberDisplay.number, + control_after_generate=True, tooltip="Random seed value", ), comfy_io.Int.Input( "steps", - default=100, + default=33, min=1, max=100, step=1, From 51fb505ffa7cdae113ef4303f9ef45a06d668a90 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 9 Oct 2025 09:06:56 +0300 Subject: [PATCH 421/759] feat(api-nodes, pylint): use lazy formatting in logging functions (#10248) --- comfy_api_nodes/apinode_utils.py | 2 +- comfy_api_nodes/apis/client.py | 55 +++++++++++++++----------- comfy_api_nodes/apis/request_logger.py | 6 +-- comfy_api_nodes/nodes_kling.py | 4 +- comfy_api_nodes/nodes_minimax.py | 2 +- comfy_api_nodes/nodes_moonvalley.py | 14 +++---- comfy_api_nodes/nodes_rodin.py | 10 ++--- comfy_api_nodes/nodes_veo2.py | 2 +- pyproject.toml | 1 - 9 files changed, 50 insertions(+), 46 deletions(-) diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py index 571c74286..2e0dc4dc1 100644 --- a/comfy_api_nodes/apinode_utils.py +++ b/comfy_api_nodes/apinode_utils.py @@ -431,7 +431,7 @@ async def upload_video_to_comfyapi( f"Video duration ({actual_duration:.2f}s) exceeds the maximum allowed ({max_duration}s)." ) except Exception as e: - logging.error(f"Error getting video duration: {e}") + logging.error("Error getting video duration: %s", str(e)) raise ValueError(f"Could not verify video duration from source: {e}") from e upload_mime_type = f"video/{container.value.lower()}" diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py index 36560c9e3..a3ceafbae 100644 --- a/comfy_api_nodes/apis/client.py +++ b/comfy_api_nodes/apis/client.py @@ -359,10 +359,10 @@ class ApiClient: if params: params = {k: v for k, v in params.items() if v is not None} # aiohttp fails to serialize None values - logging.debug(f"[DEBUG] Request Headers: {request_headers}") - logging.debug(f"[DEBUG] Files: {files}") - logging.debug(f"[DEBUG] Params: {params}") - logging.debug(f"[DEBUG] Data: {data}") + logging.debug("[DEBUG] Request Headers: %s", request_headers) + logging.debug("[DEBUG] Files: %s", files) + logging.debug("[DEBUG] Params: %s", params) + logging.debug("[DEBUG] Data: %s", data) if content_type == "application/x-www-form-urlencoded": payload_args = self._create_urlencoded_form_data_args(data or {}, request_headers) @@ -592,9 +592,9 @@ class ApiClient: error_message=f"HTTP Error {exc.status}", ) - logging.debug(f"[DEBUG] API Error: {user_friendly} (Status: {status_code})") + logging.debug("[DEBUG] API Error: %s (Status: %s)", user_friendly, status_code) if response_content: - logging.debug(f"[DEBUG] Response content: {response_content}") + logging.debug("[DEBUG] Response content: %s", response_content) # Retry if eligible if status_code in self.retry_status_codes and retry_count < self.max_retries: @@ -738,11 +738,9 @@ class SynchronousOperation(Generic[T, R]): if isinstance(v, Enum): request_dict[k] = v.value - logging.debug( - f"[DEBUG] API Request: {self.endpoint.method.value} {self.endpoint.path}" - ) - logging.debug(f"[DEBUG] Request Data: {json.dumps(request_dict, indent=2)}") - logging.debug(f"[DEBUG] Query Params: {self.endpoint.query_params}") + logging.debug("[DEBUG] API Request: %s %s", self.endpoint.method.value, self.endpoint.path) + logging.debug("[DEBUG] Request Data: %s", json.dumps(request_dict, indent=2)) + logging.debug("[DEBUG] Query Params: %s", self.endpoint.query_params) response_json = await client.request( self.endpoint.method.value, @@ -757,11 +755,11 @@ class SynchronousOperation(Generic[T, R]): logging.debug("=" * 50) logging.debug("[DEBUG] RESPONSE DETAILS:") logging.debug("[DEBUG] Status Code: 200 (Success)") - logging.debug(f"[DEBUG] Response Body: {json.dumps(response_json, indent=2)}") + logging.debug("[DEBUG] Response Body: %s", json.dumps(response_json, indent=2)) logging.debug("=" * 50) parsed_response = self.endpoint.response_model.model_validate(response_json) - logging.debug(f"[DEBUG] Parsed Response: {parsed_response}") + logging.debug("[DEBUG] Parsed Response: %s", parsed_response) return parsed_response finally: if owns_client: @@ -877,7 +875,7 @@ class PollingOperation(Generic[T, R]): status = TaskStatus.PENDING for poll_count in range(1, self.max_poll_attempts + 1): try: - logging.debug(f"[DEBUG] Polling attempt #{poll_count}") + logging.debug("[DEBUG] Polling attempt #%s", poll_count) request_dict = ( None if self.request is None else self.request.model_dump(exclude_none=True) @@ -885,10 +883,13 @@ class PollingOperation(Generic[T, R]): if poll_count == 1: logging.debug( - f"[DEBUG] Poll Request: {self.poll_endpoint.method.value} {self.poll_endpoint.path}" + "[DEBUG] Poll Request: %s %s", + self.poll_endpoint.method.value, + self.poll_endpoint.path, ) logging.debug( - f"[DEBUG] Poll Request Data: {json.dumps(request_dict, indent=2) if request_dict else 'None'}" + "[DEBUG] Poll Request Data: %s", + json.dumps(request_dict, indent=2) if request_dict else "None", ) # Query task status @@ -903,7 +904,7 @@ class PollingOperation(Generic[T, R]): # Check if task is complete status = self._check_task_status(response_obj) - logging.debug(f"[DEBUG] Task Status: {status}") + logging.debug("[DEBUG] Task Status: %s", status) # If progress extractor is provided, extract progress if self.progress_extractor: @@ -917,7 +918,7 @@ class PollingOperation(Generic[T, R]): result_url = self.result_url_extractor(response_obj) if result_url: message = f"Result URL: {result_url}" - logging.debug(f"[DEBUG] {message}") + logging.debug("[DEBUG] %s", message) self._display_text_on_node(message) self.final_response = response_obj if self.progress_extractor: @@ -925,7 +926,7 @@ class PollingOperation(Generic[T, R]): return self.final_response if status == TaskStatus.FAILED: message = f"Task failed: {json.dumps(resp)}" - logging.error(f"[DEBUG] {message}") + logging.error("[DEBUG] %s", message) raise Exception(message) logging.debug("[DEBUG] Task still pending, continuing to poll...") # Task pending – wait @@ -939,7 +940,12 @@ class PollingOperation(Generic[T, R]): raise Exception( f"Polling aborted after {consecutive_errors} network errors: {str(e)}" ) from e - logging.warning("Network error (%s/%s): %s", consecutive_errors, max_consecutive_errors, str(e)) + logging.warning( + "Network error (%s/%s): %s", + consecutive_errors, + max_consecutive_errors, + str(e), + ) await asyncio.sleep(self.poll_interval) except Exception as e: # For other errors, increment count and potentially abort @@ -949,10 +955,13 @@ class PollingOperation(Generic[T, R]): f"Polling aborted after {consecutive_errors} consecutive errors: {str(e)}" ) from e - logging.error(f"[DEBUG] Polling error: {str(e)}") + logging.error("[DEBUG] Polling error: %s", str(e)) logging.warning( - f"Error during polling (attempt {poll_count}/{self.max_poll_attempts}): {str(e)}. " - f"Will retry in {self.poll_interval} seconds." + "Error during polling (attempt %s/%s): %s. Will retry in %s seconds.", + poll_count, + self.max_poll_attempts, + str(e), + self.poll_interval, ) await asyncio.sleep(self.poll_interval) diff --git a/comfy_api_nodes/apis/request_logger.py b/comfy_api_nodes/apis/request_logger.py index 2e0ca5380..c6974d35c 100644 --- a/comfy_api_nodes/apis/request_logger.py +++ b/comfy_api_nodes/apis/request_logger.py @@ -21,7 +21,7 @@ def get_log_directory(): try: os.makedirs(log_dir, exist_ok=True) except Exception as e: - logger.error(f"Error creating API log directory {log_dir}: {e}") + logger.error("Error creating API log directory %s: %s", log_dir, str(e)) # Fallback to base temp directory if sub-directory creation fails return base_temp_dir return log_dir @@ -122,9 +122,9 @@ def log_request_response( try: with open(filepath, "w", encoding="utf-8") as f: f.write("\n".join(log_content)) - logger.debug(f"API log saved to: {filepath}") + logger.debug("API log saved to: %s", filepath) except Exception as e: - logger.error(f"Error writing API log to {filepath}: {e}") + logger.error("Error writing API log to %s: %s", filepath, str(e)) if __name__ == '__main__': diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index fe5b8562d..a3cd09786 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -296,7 +296,7 @@ def validate_video_result_response(response) -> None: """Validates that the Kling task result contains a video.""" if not is_valid_video_response(response): error_msg = f"Kling task {response.data.task_id} succeeded but no video data found in response." - logging.error(f"Error: {error_msg}.\nResponse: {response}") + logging.error("Error: %s.\nResponse: %s", error_msg, response) raise Exception(error_msg) @@ -304,7 +304,7 @@ def validate_image_result_response(response) -> None: """Validates that the Kling task result contains an image.""" if not is_valid_image_response(response): error_msg = f"Kling task {response.data.task_id} succeeded but no image data found in response." - logging.error(f"Error: {error_msg}.\nResponse: {response}") + logging.error("Error: %s.\nResponse: %s", error_msg, response) raise Exception(error_msg) diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index bf560661c..caa3d4260 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -500,7 +500,7 @@ class MinimaxHailuoVideoNode(comfy_io.ComfyNode): raise Exception( f"No video was found in the response. Full response: {file_result.model_dump()}" ) - logging.info(f"Generated video URL: {file_url}") + logging.info("Generated video URL: %s", file_url) if cls.hidden.unique_id: if hasattr(file_result.file, "backup_download_url"): message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}" diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 4a56d31f8..77e4b536c 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -237,7 +237,7 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: audio_stream = None for stream in input_container.streams: - logging.info(f"Found stream: type={stream.type}, class={type(stream)}") + logging.info("Found stream: type=%s, class=%s", stream.type, type(stream)) if isinstance(stream, av.VideoStream): # Create output video stream with same parameters video_stream = output_container.add_stream( @@ -247,7 +247,7 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: video_stream.height = stream.height video_stream.pix_fmt = "yuv420p" logging.info( - f"Added video stream: {stream.width}x{stream.height} @ {stream.average_rate}fps" + "Added video stream: %sx%s @ %sfps", stream.width, stream.height, stream.average_rate ) elif isinstance(stream, av.AudioStream): # Create output audio stream with same parameters @@ -256,9 +256,7 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: ) audio_stream.sample_rate = stream.sample_rate audio_stream.layout = stream.layout - logging.info( - f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels" - ) + logging.info("Added audio stream: %sHz, %s channels", stream.sample_rate, stream.channels) # Calculate target frame count that's divisible by 16 fps = input_container.streams.video[0].average_rate @@ -288,9 +286,7 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: for packet in video_stream.encode(): output_container.mux(packet) - logging.info( - f"Encoded {frame_count} video frames (target: {target_frames})" - ) + logging.info("Encoded %s video frames (target: %s)", frame_count, target_frames) # Decode and re-encode audio frames if audio_stream: @@ -308,7 +304,7 @@ def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: for packet in audio_stream.encode(): output_container.mux(packet) - logging.info(f"Encoded {audio_frame_count} audio frames") + logging.info("Encoded %s audio frames", audio_frame_count) # Close containers output_container.close() diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index bd758f762..0eb762a1c 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -172,16 +172,16 @@ async def create_generate_task( logging.info("[ Rodin3D API - Submit Jobs ] Submit Generate Task Success!") subscription_key = response.jobs.subscription_key task_uuid = response.uuid - logging.info(f"[ Rodin3D API - Submit Jobs ] UUID: {task_uuid}") + logging.info("[ Rodin3D API - Submit Jobs ] UUID: %s", task_uuid) return task_uuid, subscription_key def check_rodin_status(response: Rodin3DCheckStatusResponse) -> str: all_done = all(job.status == JobStatus.Done for job in response.jobs) status_list = [str(job.status) for job in response.jobs] - logging.info(f"[ Rodin3D API - CheckStatus ] Generate Status: {status_list}") + logging.info("[ Rodin3D API - CheckStatus ] Generate Status: %s", status_list) if any(job.status == JobStatus.Failed for job in response.jobs): - logging.error(f"[ Rodin3D API - CheckStatus ] Generate Failed: {status_list}, Please try again.") + logging.error("[ Rodin3D API - CheckStatus ] Generate Failed: %s, Please try again.", status_list) raise Exception("[ Rodin3D API ] Generate Failed, Please Try again.") if all_done: return "DONE" @@ -235,7 +235,7 @@ async def download_files(url_list, task_uuid): file_path = os.path.join(save_path, file_name) if file_path.endswith(".glb"): model_file_path = file_path - logging.info(f"[ Rodin3D API - download_files ] Downloading file: {file_path}") + logging.info("[ Rodin3D API - download_files ] Downloading file: %s", file_path) max_retries = 5 for attempt in range(max_retries): try: @@ -246,7 +246,7 @@ async def download_files(url_list, task_uuid): f.write(chunk) break except Exception as e: - logging.info(f"[ Rodin3D API - download_files ] Error downloading {file_path}:{e}") + logging.info("[ Rodin3D API - download_files ] Error downloading %s:%s", file_path, str(e)) if attempt < max_retries - 1: logging.info("Retrying...") await asyncio.sleep(2) diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index 251aecd42..9d5eced1e 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -215,7 +215,7 @@ class VeoVideoGenerationNode(comfy_io.ComfyNode): initial_response = await initial_operation.execute() operation_name = initial_response.name - logging.info(f"Veo generation started with operation name: {operation_name}") + logging.info("Veo generation started with operation name: %s", operation_name) # Define status extractor function def status_extractor(response): diff --git a/pyproject.toml b/pyproject.toml index 6ea839336..5dcc49a47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,6 @@ messages_control.disable = [ # next warnings should be fixed in future "bad-classmethod-argument", # Class method should have 'cls' as first argument "wrong-import-order", # Standard imports should be placed before third party imports - "logging-fstring-interpolation", # Use lazy % formatting in logging functions "ungrouped-imports", "unnecessary-pass", "unnecessary-lambda-assignment", From 2ba8d7cce8b6d78efa4b853ae8df187bb13061a3 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 9 Oct 2025 09:10:23 +0300 Subject: [PATCH 422/759] convert nodes_model_downscale.py to V3 schema (#10199) --- comfy_extras/nodes_model_downscale.py | 61 +++++++++++++++++---------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/comfy_extras/nodes_model_downscale.py b/comfy_extras/nodes_model_downscale.py index 49420dee9..f7ca9699d 100644 --- a/comfy_extras/nodes_model_downscale.py +++ b/comfy_extras/nodes_model_downscale.py @@ -1,24 +1,33 @@ +from typing_extensions import override import comfy.utils +from comfy_api.latest import ComfyExtension, io -class PatchModelAddDownscale: - upscale_methods = ["bicubic", "nearest-exact", "bilinear", "area", "bislerp"] + +class PatchModelAddDownscale(io.ComfyNode): + UPSCALE_METHODS = ["bicubic", "nearest-exact", "bilinear", "area", "bislerp"] @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "block_number": ("INT", {"default": 3, "min": 1, "max": 32, "step": 1}), - "downscale_factor": ("FLOAT", {"default": 2.0, "min": 0.1, "max": 9.0, "step": 0.001}), - "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 0.35, "min": 0.0, "max": 1.0, "step": 0.001}), - "downscale_after_skip": ("BOOLEAN", {"default": True}), - "downscale_method": (s.upscale_methods,), - "upscale_method": (s.upscale_methods,), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="PatchModelAddDownscale", + display_name="PatchModelAddDownscale (Kohya Deep Shrink)", + category="model_patches/unet", + inputs=[ + io.Model.Input("model"), + io.Int.Input("block_number", default=3, min=1, max=32, step=1), + io.Float.Input("downscale_factor", default=2.0, min=0.1, max=9.0, step=0.001), + io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=0.35, min=0.0, max=1.0, step=0.001), + io.Boolean.Input("downscale_after_skip", default=True), + io.Combo.Input("downscale_method", options=cls.UPSCALE_METHODS), + io.Combo.Input("upscale_method", options=cls.UPSCALE_METHODS), + ], + outputs=[ + io.Model.Output(), + ], + ) - CATEGORY = "model_patches/unet" - - def patch(self, model, block_number, downscale_factor, start_percent, end_percent, downscale_after_skip, downscale_method, upscale_method): + @classmethod + def execute(cls, model, block_number, downscale_factor, start_percent, end_percent, downscale_after_skip, downscale_method, upscale_method) -> io.NodeOutput: model_sampling = model.get_model_object("model_sampling") sigma_start = model_sampling.percent_to_sigma(start_percent) sigma_end = model_sampling.percent_to_sigma(end_percent) @@ -41,13 +50,21 @@ class PatchModelAddDownscale: else: m.set_model_input_block_patch(input_block_patch) m.set_model_output_block_patch(output_block_patch) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "PatchModelAddDownscale": PatchModelAddDownscale, -} NODE_DISPLAY_NAME_MAPPINGS = { # Sampling - "PatchModelAddDownscale": "PatchModelAddDownscale (Kohya Deep Shrink)", + "PatchModelAddDownscale": "", } + +class ModelDownscaleExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + PatchModelAddDownscale, + ] + + +async def comfy_entrypoint() -> ModelDownscaleExtension: + return ModelDownscaleExtension() From 989f715d92678e02b0a2db948e0610027cee7d96 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 9 Oct 2025 09:11:45 +0300 Subject: [PATCH 423/759] convert nodes_lora_extract.py to V3 schema (#10182) --- comfy_extras/nodes_lora_extract.py | 70 ++++++++++++++++++------------ 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/comfy_extras/nodes_lora_extract.py b/comfy_extras/nodes_lora_extract.py index dfd4fe9f4..a2375cba7 100644 --- a/comfy_extras/nodes_lora_extract.py +++ b/comfy_extras/nodes_lora_extract.py @@ -5,6 +5,8 @@ import folder_paths import os import logging from enum import Enum +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io CLAMP_QUANTILE = 0.99 @@ -71,32 +73,40 @@ def calc_lora_model(model_diff, rank, prefix_model, prefix_lora, output_sd, lora output_sd["{}{}.diff_b".format(prefix_lora, k[len(prefix_model):-5])] = sd[k].contiguous().half().cpu() return output_sd -class LoraSave: - def __init__(self): - self.output_dir = folder_paths.get_output_directory() +class LoraSave(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="LoraSave", + display_name="Extract and Save Lora", + category="_for_testing", + inputs=[ + io.String.Input("filename_prefix", default="loras/ComfyUI_extracted_lora"), + io.Int.Input("rank", default=8, min=1, max=4096, step=1), + io.Combo.Input("lora_type", options=tuple(LORA_TYPES.keys())), + io.Boolean.Input("bias_diff", default=True), + io.Model.Input( + "model_diff", + tooltip="The ModelSubtract output to be converted to a lora.", + optional=True, + ), + io.Clip.Input( + "text_encoder_diff", + tooltip="The CLIPSubtract output to be converted to a lora.", + optional=True, + ), + ], + is_experimental=True, + is_output_node=True, + ) @classmethod - def INPUT_TYPES(s): - return {"required": {"filename_prefix": ("STRING", {"default": "loras/ComfyUI_extracted_lora"}), - "rank": ("INT", {"default": 8, "min": 1, "max": 4096, "step": 1}), - "lora_type": (tuple(LORA_TYPES.keys()),), - "bias_diff": ("BOOLEAN", {"default": True}), - }, - "optional": {"model_diff": ("MODEL", {"tooltip": "The ModelSubtract output to be converted to a lora."}), - "text_encoder_diff": ("CLIP", {"tooltip": "The CLIPSubtract output to be converted to a lora."})}, - } - RETURN_TYPES = () - FUNCTION = "save" - OUTPUT_NODE = True - - CATEGORY = "_for_testing" - - def save(self, filename_prefix, rank, lora_type, bias_diff, model_diff=None, text_encoder_diff=None): + def execute(cls, filename_prefix, rank, lora_type, bias_diff, model_diff=None, text_encoder_diff=None) -> io.NodeOutput: if model_diff is None and text_encoder_diff is None: - return {} + return io.NodeOutput() lora_type = LORA_TYPES.get(lora_type) - full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir) + full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, folder_paths.get_output_directory()) output_sd = {} if model_diff is not None: @@ -108,12 +118,16 @@ class LoraSave: output_checkpoint = os.path.join(full_output_folder, output_checkpoint) comfy.utils.save_torch_file(output_sd, output_checkpoint, metadata=None) - return {} + return io.NodeOutput() -NODE_CLASS_MAPPINGS = { - "LoraSave": LoraSave -} -NODE_DISPLAY_NAME_MAPPINGS = { - "LoraSave": "Extract and Save Lora" -} +class LoraSaveExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + LoraSave, + ] + + +async def comfy_entrypoint() -> LoraSaveExtension: + return LoraSaveExtension() From 6732014a0a99e85389e5c32e87bdff9e31cdcfd1 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 9 Oct 2025 09:13:15 +0300 Subject: [PATCH 424/759] convert nodes_compositing.py to V3 schema (#10174) --- comfy_extras/nodes_compositing.py | 123 ++++++++++++++++-------------- 1 file changed, 66 insertions(+), 57 deletions(-) diff --git a/comfy_extras/nodes_compositing.py b/comfy_extras/nodes_compositing.py index 2f994fa11..e4e4e1cbc 100644 --- a/comfy_extras/nodes_compositing.py +++ b/comfy_extras/nodes_compositing.py @@ -1,6 +1,9 @@ import torch import comfy.utils from enum import Enum +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io + def resize_mask(mask, shape): return torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(shape[0], shape[1]), mode="bilinear").squeeze(1) @@ -101,24 +104,28 @@ def porter_duff_composite(src_image: torch.Tensor, src_alpha: torch.Tensor, dst_ return out_image, out_alpha -class PorterDuffImageComposite: +class PorterDuffImageComposite(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "source": ("IMAGE",), - "source_alpha": ("MASK",), - "destination": ("IMAGE",), - "destination_alpha": ("MASK",), - "mode": ([mode.name for mode in PorterDuffMode], {"default": PorterDuffMode.DST.name}), - }, - } + def define_schema(cls): + return io.Schema( + node_id="PorterDuffImageComposite", + display_name="Porter-Duff Image Composite", + category="mask/compositing", + inputs=[ + io.Image.Input("source"), + io.Mask.Input("source_alpha"), + io.Image.Input("destination"), + io.Mask.Input("destination_alpha"), + io.Combo.Input("mode", options=[mode.name for mode in PorterDuffMode], default=PorterDuffMode.DST.name), + ], + outputs=[ + io.Image.Output(), + io.Mask.Output(), + ], + ) - RETURN_TYPES = ("IMAGE", "MASK") - FUNCTION = "composite" - CATEGORY = "mask/compositing" - - def composite(self, source: torch.Tensor, source_alpha: torch.Tensor, destination: torch.Tensor, destination_alpha: torch.Tensor, mode): + @classmethod + def execute(cls, source: torch.Tensor, source_alpha: torch.Tensor, destination: torch.Tensor, destination_alpha: torch.Tensor, mode) -> io.NodeOutput: batch_size = min(len(source), len(source_alpha), len(destination), len(destination_alpha)) out_images = [] out_alphas = [] @@ -150,45 +157,48 @@ class PorterDuffImageComposite: out_images.append(out_image) out_alphas.append(out_alpha.squeeze(2)) - result = (torch.stack(out_images), torch.stack(out_alphas)) - return result + return io.NodeOutput(torch.stack(out_images), torch.stack(out_alphas)) -class SplitImageWithAlpha: +class SplitImageWithAlpha(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - } - } + def define_schema(cls): + return io.Schema( + node_id="SplitImageWithAlpha", + display_name="Split Image with Alpha", + category="mask/compositing", + inputs=[ + io.Image.Input("image"), + ], + outputs=[ + io.Image.Output(), + io.Mask.Output(), + ], + ) - CATEGORY = "mask/compositing" - RETURN_TYPES = ("IMAGE", "MASK") - FUNCTION = "split_image_with_alpha" - - def split_image_with_alpha(self, image: torch.Tensor): + @classmethod + def execute(cls, image: torch.Tensor) -> io.NodeOutput: out_images = [i[:,:,:3] for i in image] out_alphas = [i[:,:,3] if i.shape[2] > 3 else torch.ones_like(i[:,:,0]) for i in image] - result = (torch.stack(out_images), 1.0 - torch.stack(out_alphas)) - return result + return io.NodeOutput(torch.stack(out_images), 1.0 - torch.stack(out_alphas)) -class JoinImageWithAlpha: +class JoinImageWithAlpha(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - "alpha": ("MASK",), - } - } + def define_schema(cls): + return io.Schema( + node_id="JoinImageWithAlpha", + display_name="Join Image with Alpha", + category="mask/compositing", + inputs=[ + io.Image.Input("image"), + io.Mask.Input("alpha"), + ], + outputs=[io.Image.Output()], + ) - CATEGORY = "mask/compositing" - RETURN_TYPES = ("IMAGE",) - FUNCTION = "join_image_with_alpha" - - def join_image_with_alpha(self, image: torch.Tensor, alpha: torch.Tensor): + @classmethod + def execute(cls, image: torch.Tensor, alpha: torch.Tensor) -> io.NodeOutput: batch_size = min(len(image), len(alpha)) out_images = [] @@ -196,19 +206,18 @@ class JoinImageWithAlpha: for i in range(batch_size): out_images.append(torch.cat((image[i][:,:,:3], alpha[i].unsqueeze(2)), dim=2)) - result = (torch.stack(out_images),) - return result + return io.NodeOutput(torch.stack(out_images)) -NODE_CLASS_MAPPINGS = { - "PorterDuffImageComposite": PorterDuffImageComposite, - "SplitImageWithAlpha": SplitImageWithAlpha, - "JoinImageWithAlpha": JoinImageWithAlpha, -} +class CompositingExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + PorterDuffImageComposite, + SplitImageWithAlpha, + JoinImageWithAlpha, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "PorterDuffImageComposite": "Porter-Duff Image Composite", - "SplitImageWithAlpha": "Split Image with Alpha", - "JoinImageWithAlpha": "Join Image with Alpha", -} +async def comfy_entrypoint() -> CompositingExtension: + return CompositingExtension() From cbee7d33909f168a08ab7e53d897ea284a304d84 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 9 Oct 2025 09:14:00 +0300 Subject: [PATCH 425/759] convert nodes_latent.py to V3 schema (#10160) --- comfy_extras/nodes_latent.py | 394 ++++++++++++++++++++--------------- 1 file changed, 224 insertions(+), 170 deletions(-) diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py index 0f90cf60c..d2df07ff9 100644 --- a/comfy_extras/nodes_latent.py +++ b/comfy_extras/nodes_latent.py @@ -2,6 +2,8 @@ import comfy.utils import comfy_extras.nodes_post_processing import torch import nodes +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io def reshape_latent_to(target_shape, latent, repeat_batch=True): @@ -13,17 +15,23 @@ def reshape_latent_to(target_shape, latent, repeat_batch=True): return latent -class LatentAdd: +class LatentAdd(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples1": ("LATENT",), "samples2": ("LATENT",)}} + def define_schema(cls): + return io.Schema( + node_id="LatentAdd", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples1"), + io.Latent.Input("samples2"), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples1, samples2): + @classmethod + def execute(cls, samples1, samples2) -> io.NodeOutput: samples_out = samples1.copy() s1 = samples1["samples"] @@ -31,19 +39,25 @@ class LatentAdd: s2 = reshape_latent_to(s1.shape, s2) samples_out["samples"] = s1 + s2 - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentSubtract: +class LatentSubtract(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples1": ("LATENT",), "samples2": ("LATENT",)}} + def define_schema(cls): + return io.Schema( + node_id="LatentSubtract", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples1"), + io.Latent.Input("samples2"), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples1, samples2): + @classmethod + def execute(cls, samples1, samples2) -> io.NodeOutput: samples_out = samples1.copy() s1 = samples1["samples"] @@ -51,41 +65,49 @@ class LatentSubtract: s2 = reshape_latent_to(s1.shape, s2) samples_out["samples"] = s1 - s2 - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentMultiply: +class LatentMultiply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples": ("LATENT",), - "multiplier": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01}), - }} + def define_schema(cls): + return io.Schema( + node_id="LatentMultiply", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples"), + io.Float.Input("multiplier", default=1.0, min=-10.0, max=10.0, step=0.01), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples, multiplier): + @classmethod + def execute(cls, samples, multiplier) -> io.NodeOutput: samples_out = samples.copy() s1 = samples["samples"] samples_out["samples"] = s1 * multiplier - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentInterpolate: +class LatentInterpolate(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples1": ("LATENT",), - "samples2": ("LATENT",), - "ratio": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - }} + def define_schema(cls): + return io.Schema( + node_id="LatentInterpolate", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples1"), + io.Latent.Input("samples2"), + io.Float.Input("ratio", default=1.0, min=0.0, max=1.0, step=0.01), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples1, samples2, ratio): + @classmethod + def execute(cls, samples1, samples2, ratio) -> io.NodeOutput: samples_out = samples1.copy() s1 = samples1["samples"] @@ -104,19 +126,26 @@ class LatentInterpolate: st = torch.nan_to_num(t / mt) samples_out["samples"] = st * (m1 * ratio + m2 * (1.0 - ratio)) - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentConcat: +class LatentConcat(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples1": ("LATENT",), "samples2": ("LATENT",), "dim": (["x", "-x", "y", "-y", "t", "-t"], )}} + def define_schema(cls): + return io.Schema( + node_id="LatentConcat", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples1"), + io.Latent.Input("samples2"), + io.Combo.Input("dim", options=["x", "-x", "y", "-y", "t", "-t"]), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples1, samples2, dim): + @classmethod + def execute(cls, samples1, samples2, dim) -> io.NodeOutput: samples_out = samples1.copy() s1 = samples1["samples"] @@ -136,22 +165,27 @@ class LatentConcat: dim = -3 samples_out["samples"] = torch.cat(c, dim=dim) - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentCut: +class LatentCut(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"samples": ("LATENT",), - "dim": (["x", "y", "t"], ), - "index": ("INT", {"default": 0, "min": -nodes.MAX_RESOLUTION, "max": nodes.MAX_RESOLUTION, "step": 1}), - "amount": ("INT", {"default": 1, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 1})}} + def define_schema(cls): + return io.Schema( + node_id="LatentCut", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples"), + io.Combo.Input("dim", options=["x", "y", "t"]), + io.Int.Input("index", default=0, min=-nodes.MAX_RESOLUTION, max=nodes.MAX_RESOLUTION, step=1), + io.Int.Input("amount", default=1, min=1, max=nodes.MAX_RESOLUTION, step=1), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples, dim, index, amount): + @classmethod + def execute(cls, samples, dim, index, amount) -> io.NodeOutput: samples_out = samples.copy() s1 = samples["samples"] @@ -171,19 +205,25 @@ class LatentCut: amount = min(-index, amount) samples_out["samples"] = torch.narrow(s1, dim, index, amount) - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentBatch: +class LatentBatch(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples1": ("LATENT",), "samples2": ("LATENT",)}} + def define_schema(cls): + return io.Schema( + node_id="LatentBatch", + category="latent/batch", + inputs=[ + io.Latent.Input("samples1"), + io.Latent.Input("samples2"), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "batch" - - CATEGORY = "latent/batch" - - def batch(self, samples1, samples2): + @classmethod + def execute(cls, samples1, samples2) -> io.NodeOutput: samples_out = samples1.copy() s1 = samples1["samples"] s2 = samples2["samples"] @@ -192,20 +232,25 @@ class LatentBatch: s = torch.cat((s1, s2), dim=0) samples_out["samples"] = s samples_out["batch_index"] = samples1.get("batch_index", [x for x in range(0, s1.shape[0])]) + samples2.get("batch_index", [x for x in range(0, s2.shape[0])]) - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentBatchSeedBehavior: +class LatentBatchSeedBehavior(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples": ("LATENT",), - "seed_behavior": (["random", "fixed"],{"default": "fixed"}),}} + def define_schema(cls): + return io.Schema( + node_id="LatentBatchSeedBehavior", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples"), + io.Combo.Input("seed_behavior", options=["random", "fixed"], default="fixed"), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples, seed_behavior): + @classmethod + def execute(cls, samples, seed_behavior) -> io.NodeOutput: samples_out = samples.copy() latent = samples["samples"] if seed_behavior == "random": @@ -215,41 +260,50 @@ class LatentBatchSeedBehavior: batch_number = samples_out.get("batch_index", [0])[0] samples_out["batch_index"] = [batch_number] * latent.shape[0] - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentApplyOperation: +class LatentApplyOperation(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples": ("LATENT",), - "operation": ("LATENT_OPERATION",), - }} + def define_schema(cls): + return io.Schema( + node_id="LatentApplyOperation", + category="latent/advanced/operations", + is_experimental=True, + inputs=[ + io.Latent.Input("samples"), + io.LatentOperation.Input("operation"), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced/operations" - EXPERIMENTAL = True - - def op(self, samples, operation): + @classmethod + def execute(cls, samples, operation) -> io.NodeOutput: samples_out = samples.copy() s1 = samples["samples"] samples_out["samples"] = operation(latent=s1) - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentApplyOperationCFG: +class LatentApplyOperationCFG(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "operation": ("LATENT_OPERATION",), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="LatentApplyOperationCFG", + category="latent/advanced/operations", + is_experimental=True, + inputs=[ + io.Model.Input("model"), + io.LatentOperation.Input("operation"), + ], + outputs=[ + io.Model.Output(), + ], + ) - CATEGORY = "latent/advanced/operations" - EXPERIMENTAL = True - - def patch(self, model, operation): + @classmethod + def execute(cls, model, operation) -> io.NodeOutput: m = model.clone() def pre_cfg_function(args): @@ -261,21 +315,25 @@ class LatentApplyOperationCFG: return conds_out m.set_model_sampler_pre_cfg_function(pre_cfg_function) - return (m, ) + return io.NodeOutput(m) -class LatentOperationTonemapReinhard: +class LatentOperationTonemapReinhard(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "multiplier": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}), - }} + def define_schema(cls): + return io.Schema( + node_id="LatentOperationTonemapReinhard", + category="latent/advanced/operations", + is_experimental=True, + inputs=[ + io.Float.Input("multiplier", default=1.0, min=0.0, max=100.0, step=0.01), + ], + outputs=[ + io.LatentOperation.Output(), + ], + ) - RETURN_TYPES = ("LATENT_OPERATION",) - FUNCTION = "op" - - CATEGORY = "latent/advanced/operations" - EXPERIMENTAL = True - - def op(self, multiplier): + @classmethod + def execute(cls, multiplier) -> io.NodeOutput: def tonemap_reinhard(latent, **kwargs): latent_vector_magnitude = (torch.linalg.vector_norm(latent, dim=(1)) + 0.0000000001)[:,None] normalized_latent = latent / latent_vector_magnitude @@ -291,39 +349,27 @@ class LatentOperationTonemapReinhard: new_magnitude *= top return normalized_latent * new_magnitude - return (tonemap_reinhard,) + return io.NodeOutput(tonemap_reinhard) -class LatentOperationSharpen: +class LatentOperationSharpen(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "sharpen_radius": ("INT", { - "default": 9, - "min": 1, - "max": 31, - "step": 1 - }), - "sigma": ("FLOAT", { - "default": 1.0, - "min": 0.1, - "max": 10.0, - "step": 0.1 - }), - "alpha": ("FLOAT", { - "default": 0.1, - "min": 0.0, - "max": 5.0, - "step": 0.01 - }), - }} + def define_schema(cls): + return io.Schema( + node_id="LatentOperationSharpen", + category="latent/advanced/operations", + is_experimental=True, + inputs=[ + io.Int.Input("sharpen_radius", default=9, min=1, max=31, step=1), + io.Float.Input("sigma", default=1.0, min=0.1, max=10.0, step=0.1), + io.Float.Input("alpha", default=0.1, min=0.0, max=5.0, step=0.01), + ], + outputs=[ + io.LatentOperation.Output(), + ], + ) - RETURN_TYPES = ("LATENT_OPERATION",) - FUNCTION = "op" - - CATEGORY = "latent/advanced/operations" - EXPERIMENTAL = True - - def op(self, sharpen_radius, sigma, alpha): + @classmethod + def execute(cls, sharpen_radius, sigma, alpha) -> io.NodeOutput: def sharpen(latent, **kwargs): luminance = (torch.linalg.vector_norm(latent, dim=(1)) + 1e-6)[:,None] normalized_latent = latent / luminance @@ -340,19 +386,27 @@ class LatentOperationSharpen: sharpened = torch.nn.functional.conv2d(padded_image, kernel.repeat(channels, 1, 1).unsqueeze(1), padding=kernel_size // 2, groups=channels)[:,:,sharpen_radius:-sharpen_radius, sharpen_radius:-sharpen_radius] return luminance * sharpened - return (sharpen,) + return io.NodeOutput(sharpen) -NODE_CLASS_MAPPINGS = { - "LatentAdd": LatentAdd, - "LatentSubtract": LatentSubtract, - "LatentMultiply": LatentMultiply, - "LatentInterpolate": LatentInterpolate, - "LatentConcat": LatentConcat, - "LatentCut": LatentCut, - "LatentBatch": LatentBatch, - "LatentBatchSeedBehavior": LatentBatchSeedBehavior, - "LatentApplyOperation": LatentApplyOperation, - "LatentApplyOperationCFG": LatentApplyOperationCFG, - "LatentOperationTonemapReinhard": LatentOperationTonemapReinhard, - "LatentOperationSharpen": LatentOperationSharpen, -} + +class LatentExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + LatentAdd, + LatentSubtract, + LatentMultiply, + LatentInterpolate, + LatentConcat, + LatentCut, + LatentBatch, + LatentBatchSeedBehavior, + LatentApplyOperation, + LatentApplyOperationCFG, + LatentOperationTonemapReinhard, + LatentOperationSharpen, + ] + + +async def comfy_entrypoint() -> LatentExtension: + return LatentExtension() From 139addd53c6cab97fb0ac28d1c895b3ecc7dff6c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 9 Oct 2025 13:37:35 -0700 Subject: [PATCH 426/759] More surgical fix for #10267 (#10276) --- comfy/model_patcher.py | 28 +++++++++++++++++++++------- comfy/ops.py | 4 +++- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 1fd03d9d1..e8c859689 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -123,16 +123,26 @@ def move_weight_functions(m, device): return memory class LowVramPatch: - def __init__(self, key, patches): + def __init__(self, key, patches, convert_func=None, set_func=None): self.key = key self.patches = patches + self.convert_func = convert_func + self.set_func = set_func + def __call__(self, weight): + if self.convert_func is not None: + weight = self.convert_func(weight.to(dtype=torch.float32, copy=True), inplace=True) + intermediate_dtype = weight.dtype - if intermediate_dtype not in [torch.float32, torch.float16, torch.bfloat16]: #intermediate_dtype has to be one that is supported in math ops + if self.set_func is None and intermediate_dtype not in [torch.float32, torch.float16, torch.bfloat16]: #intermediate_dtype has to be one that is supported in math ops intermediate_dtype = torch.float32 return comfy.float.stochastic_rounding(comfy.lora.calculate_weight(self.patches[self.key], weight.to(intermediate_dtype), self.key, intermediate_dtype=intermediate_dtype), weight.dtype, seed=string_to_seed(self.key)) - return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=intermediate_dtype) + out = comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=intermediate_dtype) + if self.set_func is not None: + return self.set_func(out, seed=string_to_seed(self.key), return_weight=True) + else: + return out def get_key_weight(model, key): set_func = None @@ -657,13 +667,15 @@ class ModelPatcher: if force_patch_weights: self.patch_weight_to_device(weight_key) else: - m.weight_function = [LowVramPatch(weight_key, self.patches)] + _, set_func, convert_func = get_key_weight(self.model, weight_key) + m.weight_function = [LowVramPatch(weight_key, self.patches, convert_func, set_func)] patch_counter += 1 if bias_key in self.patches: if force_patch_weights: self.patch_weight_to_device(bias_key) else: - m.bias_function = [LowVramPatch(bias_key, self.patches)] + _, set_func, convert_func = get_key_weight(self.model, bias_key) + m.bias_function = [LowVramPatch(bias_key, self.patches, convert_func, set_func)] patch_counter += 1 cast_weight = True @@ -825,10 +837,12 @@ class ModelPatcher: module_mem += move_weight_functions(m, device_to) if lowvram_possible: if weight_key in self.patches: - m.weight_function.append(LowVramPatch(weight_key, self.patches)) + _, set_func, convert_func = get_key_weight(self.model, weight_key) + m.weight_function.append(LowVramPatch(weight_key, self.patches, convert_func, set_func)) patch_counter += 1 if bias_key in self.patches: - m.bias_function.append(LowVramPatch(bias_key, self.patches)) + _, set_func, convert_func = get_key_weight(self.model, bias_key) + m.bias_function.append(LowVramPatch(bias_key, self.patches, convert_func, set_func)) patch_counter += 1 cast_weight = True diff --git a/comfy/ops.py b/comfy/ops.py index 9d7dedd37..2415c96bf 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -416,8 +416,10 @@ def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None else: return weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype) - def set_weight(self, weight, inplace_update=False, seed=None, **kwargs): + def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs): weight = comfy.float.stochastic_rounding(weight / self.scale_weight.to(device=weight.device, dtype=weight.dtype), self.weight.dtype, seed=seed) + if return_weight: + return weight if inplace_update: self.weight.data.copy_(weight) else: From f3d5d328a39d2f264b35d43f0e9c5a0b4d780c2f Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 10 Oct 2025 01:15:03 +0300 Subject: [PATCH 427/759] fix(v3,api-nodes): V3 schema typing; corrected Pika API nodes (#10265) --- comfy_api/latest/__init__.py | 9 +- comfy_api/latest/_input/video_types.py | 4 +- comfy_api/latest/_io.py | 146 ++++++------- comfy_api/latest/_ui.py | 25 +-- comfy_api_nodes/apinode_utils.py | 2 +- comfy_api_nodes/apis/client.py | 58 +++--- comfy_api_nodes/apis/pika_defs.py | 100 +++++++++ comfy_api_nodes/nodes_pika.py | 277 ++++++++----------------- 8 files changed, 309 insertions(+), 312 deletions(-) create mode 100644 comfy_api_nodes/apis/pika_defs.py diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py index 2cee65aa9..b19a97f1d 100644 --- a/comfy_api/latest/__init__.py +++ b/comfy_api/latest/__init__.py @@ -8,8 +8,8 @@ from comfy_api.internal.async_to_sync import create_sync_class from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents -from comfy_api.latest._io import _IO as io #noqa: F401 -from comfy_api.latest._ui import _UI as ui #noqa: F401 +from . import _io as io +from . import _ui as ui # from comfy_api.latest._resources import _RESOURCES as resources #noqa: F401 from comfy_execution.utils import get_executing_context from comfy_execution.progress import get_progress_state, PreviewImageTuple @@ -114,6 +114,8 @@ if TYPE_CHECKING: ComfyAPISync: Type[comfy_api.latest.generated.ComfyAPISyncStub.ComfyAPISyncStub] ComfyAPISync = create_sync_class(ComfyAPI_latest) +comfy_io = io # create the new alias for io + __all__ = [ "ComfyAPI", "ComfyAPISync", @@ -121,4 +123,7 @@ __all__ = [ "InputImpl", "Types", "ComfyExtension", + "io", + "comfy_io", + "ui", ] diff --git a/comfy_api/latest/_input/video_types.py b/comfy_api/latest/_input/video_types.py index 5d95dc507..a335df4d0 100644 --- a/comfy_api/latest/_input/video_types.py +++ b/comfy_api/latest/_input/video_types.py @@ -1,6 +1,6 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Optional, Union +from typing import Optional, Union, IO import io import av from comfy_api.util import VideoContainer, VideoCodec, VideoComponents @@ -23,7 +23,7 @@ class VideoInput(ABC): @abstractmethod def save_to( self, - path: str, + path: Union[str, IO[bytes]], format: VideoContainer = VideoContainer.AUTO, codec: VideoCodec = VideoCodec.AUTO, metadata: Optional[dict] = None diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 661309f19..0b701260f 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -1582,78 +1582,78 @@ class _UIOutput(ABC): ... -class _IO: - FolderType = FolderType - UploadType = UploadType - RemoteOptions = RemoteOptions - NumberDisplay = NumberDisplay +__all__ = [ + "FolderType", + "UploadType", + "RemoteOptions", + "NumberDisplay", - comfytype = staticmethod(comfytype) - Custom = staticmethod(Custom) - Input = Input - WidgetInput = WidgetInput - Output = Output - ComfyTypeI = ComfyTypeI - ComfyTypeIO = ComfyTypeIO - #--------------------------------- + "comfytype", + "Custom", + "Input", + "WidgetInput", + "Output", + "ComfyTypeI", + "ComfyTypeIO", # Supported Types - Boolean = Boolean - Int = Int - Float = Float - String = String - Combo = Combo - MultiCombo = MultiCombo - Image = Image - WanCameraEmbedding = WanCameraEmbedding - Webcam = Webcam - Mask = Mask - Latent = Latent - Conditioning = Conditioning - Sampler = Sampler - Sigmas = Sigmas - Noise = Noise - Guider = Guider - Clip = Clip - ControlNet = ControlNet - Vae = Vae - Model = Model - ClipVision = ClipVision - ClipVisionOutput = ClipVisionOutput - AudioEncoder = AudioEncoder - AudioEncoderOutput = AudioEncoderOutput - StyleModel = StyleModel - Gligen = Gligen - UpscaleModel = UpscaleModel - Audio = Audio - Video = Video - SVG = SVG - LoraModel = LoraModel - LossMap = LossMap - Voxel = Voxel - Mesh = Mesh - Hooks = Hooks - HookKeyframes = HookKeyframes - TimestepsRange = TimestepsRange - LatentOperation = LatentOperation - FlowControl = FlowControl - Accumulation = Accumulation - Load3DCamera = Load3DCamera - Load3D = Load3D - Load3DAnimation = Load3DAnimation - Photomaker = Photomaker - Point = Point - FaceAnalysis = FaceAnalysis - BBOX = BBOX - SEGS = SEGS - AnyType = AnyType - MultiType = MultiType - #--------------------------------- - HiddenHolder = HiddenHolder - Hidden = Hidden - NodeInfoV1 = NodeInfoV1 - NodeInfoV3 = NodeInfoV3 - Schema = Schema - ComfyNode = ComfyNode - NodeOutput = NodeOutput - add_to_dict_v1 = staticmethod(add_to_dict_v1) - add_to_dict_v3 = staticmethod(add_to_dict_v3) + "Boolean", + "Int", + "Float", + "String", + "Combo", + "MultiCombo", + "Image", + "WanCameraEmbedding", + "Webcam", + "Mask", + "Latent", + "Conditioning", + "Sampler", + "Sigmas", + "Noise", + "Guider", + "Clip", + "ControlNet", + "Vae", + "Model", + "ClipVision", + "ClipVisionOutput", + "AudioEncoder", + "AudioEncoderOutput", + "StyleModel", + "Gligen", + "UpscaleModel", + "Audio", + "Video", + "SVG", + "LoraModel", + "LossMap", + "Voxel", + "Mesh", + "Hooks", + "HookKeyframes", + "TimestepsRange", + "LatentOperation", + "FlowControl", + "Accumulation", + "Load3DCamera", + "Load3D", + "Load3DAnimation", + "Photomaker", + "Point", + "FaceAnalysis", + "BBOX", + "SEGS", + "AnyType", + "MultiType", + # Other classes + "HiddenHolder", + "Hidden", + "NodeInfoV1", + "NodeInfoV3", + "Schema", + "ComfyNode", + "NodeOutput", + "add_to_dict_v1", + "add_to_dict_v3", +] diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py index 26a55615f..b0bbabe2a 100644 --- a/comfy_api/latest/_ui.py +++ b/comfy_api/latest/_ui.py @@ -449,15 +449,16 @@ class PreviewText(_UIOutput): return {"text": (self.value,)} -class _UI: - SavedResult = SavedResult - SavedImages = SavedImages - SavedAudios = SavedAudios - ImageSaveHelper = ImageSaveHelper - AudioSaveHelper = AudioSaveHelper - PreviewImage = PreviewImage - PreviewMask = PreviewMask - PreviewAudio = PreviewAudio - PreviewVideo = PreviewVideo - PreviewUI3D = PreviewUI3D - PreviewText = PreviewText +__all__ = [ + "SavedResult", + "SavedImages", + "SavedAudios", + "ImageSaveHelper", + "AudioSaveHelper", + "PreviewImage", + "PreviewMask", + "PreviewAudio", + "PreviewVideo", + "PreviewUI3D", + "PreviewText", +] diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py index 2e0dc4dc1..4bab539f7 100644 --- a/comfy_api_nodes/apinode_utils.py +++ b/comfy_api_nodes/apinode_utils.py @@ -269,7 +269,7 @@ def tensor_to_bytesio( mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4'). Returns: - Named BytesIO object containing the image data. + Named BytesIO object containing the image data, with pointer set to the start of buffer. """ if not mime_type: mime_type = "image/png" diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py index a3ceafbae..e08dfb093 100644 --- a/comfy_api_nodes/apis/client.py +++ b/comfy_api_nodes/apis/client.py @@ -98,7 +98,7 @@ import io import os import socket from aiohttp.client_exceptions import ClientError, ClientResponseError -from typing import Dict, Type, Optional, Any, TypeVar, Generic, Callable, Tuple +from typing import Type, Optional, Any, TypeVar, Generic, Callable from enum import Enum import json from urllib.parse import urljoin, urlparse @@ -175,7 +175,7 @@ class ApiClient: max_retries: int = 3, retry_delay: float = 1.0, retry_backoff_factor: float = 2.0, - retry_status_codes: Optional[Tuple[int, ...]] = None, + retry_status_codes: Optional[tuple[int, ...]] = None, session: Optional[aiohttp.ClientSession] = None, ): self.base_url = base_url @@ -199,9 +199,9 @@ class ApiClient: @staticmethod def _create_json_payload_args( - data: Optional[Dict[str, Any]] = None, - headers: Optional[Dict[str, str]] = None, - ) -> Dict[str, Any]: + data: Optional[dict[str, Any]] = None, + headers: Optional[dict[str, str]] = None, + ) -> dict[str, Any]: return { "json": data, "headers": headers, @@ -209,11 +209,11 @@ class ApiClient: def _create_form_data_args( self, - data: Dict[str, Any] | None, - files: Dict[str, Any] | None, - headers: Optional[Dict[str, str]] = None, + data: dict[str, Any] | None, + files: dict[str, Any] | None, + headers: Optional[dict[str, str]] = None, multipart_parser: Callable | None = None, - ) -> Dict[str, Any]: + ) -> dict[str, Any]: if headers and "Content-Type" in headers: del headers["Content-Type"] @@ -254,9 +254,9 @@ class ApiClient: @staticmethod def _create_urlencoded_form_data_args( - data: Dict[str, Any], - headers: Optional[Dict[str, str]] = None, - ) -> Dict[str, Any]: + data: dict[str, Any], + headers: Optional[dict[str, str]] = None, + ) -> dict[str, Any]: headers = headers or {} headers["Content-Type"] = "application/x-www-form-urlencoded" return { @@ -264,7 +264,7 @@ class ApiClient: "headers": headers, } - def get_headers(self) -> Dict[str, str]: + def get_headers(self) -> dict[str, str]: """Get headers for API requests, including authentication if available""" headers = {"Content-Type": "application/json", "Accept": "application/json"} @@ -275,7 +275,7 @@ class ApiClient: return headers - async def _check_connectivity(self, target_url: str) -> Dict[str, bool]: + async def _check_connectivity(self, target_url: str) -> dict[str, bool]: """ Check connectivity to determine if network issues are local or server-related. @@ -316,14 +316,14 @@ class ApiClient: self, method: str, path: str, - params: Optional[Dict[str, Any]] = None, - data: Optional[Dict[str, Any]] = None, - files: Optional[Dict[str, Any] | list[tuple[str, Any]]] = None, - headers: Optional[Dict[str, str]] = None, + params: Optional[dict[str, Any]] = None, + data: Optional[dict[str, Any]] = None, + files: Optional[dict[str, Any] | list[tuple[str, Any]]] = None, + headers: Optional[dict[str, str]] = None, content_type: str = "application/json", multipart_parser: Callable | None = None, retry_count: int = 0, # Used internally for tracking retries - ) -> Dict[str, Any]: + ) -> dict[str, Any]: """ Make an HTTP request to the API with automatic retries for transient errors. @@ -485,7 +485,7 @@ class ApiClient: retry_delay: Initial delay between retries in seconds retry_backoff_factor: Multiplier for the delay after each retry """ - headers: Dict[str, str] = {} + headers: dict[str, str] = {} skip_auto_headers: set[str] = set() if content_type: headers["Content-Type"] = content_type @@ -558,7 +558,7 @@ class ApiClient: *req_meta, retry_count: int, response_content: dict | str = "", - ) -> Dict[str, Any]: + ) -> dict[str, Any]: status_code = exc.status if status_code == 401: user_friendly = "Unauthorized: Please login first to use this node." @@ -659,7 +659,7 @@ class ApiEndpoint(Generic[T, R]): method: HttpMethod, request_model: Type[T], response_model: Type[R], - query_params: Optional[Dict[str, Any]] = None, + query_params: Optional[dict[str, Any]] = None, ): """Initialize an API endpoint definition. @@ -684,11 +684,11 @@ class SynchronousOperation(Generic[T, R]): self, endpoint: ApiEndpoint[T, R], request: T, - files: Optional[Dict[str, Any] | list[tuple[str, Any]]] = None, + files: Optional[dict[str, Any] | list[tuple[str, Any]]] = None, api_base: str | None = None, auth_token: Optional[str] = None, comfy_api_key: Optional[str] = None, - auth_kwargs: Optional[Dict[str, str]] = None, + auth_kwargs: Optional[dict[str, str]] = None, timeout: float = 7200.0, verify_ssl: bool = True, content_type: str = "application/json", @@ -729,7 +729,7 @@ class SynchronousOperation(Generic[T, R]): ) try: - request_dict: Optional[Dict[str, Any]] + request_dict: Optional[dict[str, Any]] if isinstance(self.request, EmptyRequest): request_dict = None else: @@ -782,14 +782,14 @@ class PollingOperation(Generic[T, R]): poll_endpoint: ApiEndpoint[EmptyRequest, R], completed_statuses: list[str], failed_statuses: list[str], - status_extractor: Callable[[R], str], - progress_extractor: Callable[[R], float] | None = None, - result_url_extractor: Callable[[R], str] | None = None, + status_extractor: Callable[[R], Optional[str]], + progress_extractor: Callable[[R], Optional[float]] | None = None, + result_url_extractor: Callable[[R], Optional[str]] | None = None, request: Optional[T] = None, api_base: str | None = None, auth_token: Optional[str] = None, comfy_api_key: Optional[str] = None, - auth_kwargs: Optional[Dict[str, str]] = None, + auth_kwargs: Optional[dict[str, str]] = None, poll_interval: float = 5.0, max_poll_attempts: int = 120, # Default max polling attempts (10 minutes with 5s interval) max_retries: int = 3, # Max retries per individual API call diff --git a/comfy_api_nodes/apis/pika_defs.py b/comfy_api_nodes/apis/pika_defs.py new file mode 100644 index 000000000..232558cd7 --- /dev/null +++ b/comfy_api_nodes/apis/pika_defs.py @@ -0,0 +1,100 @@ +from typing import Optional +from enum import Enum +from pydantic import BaseModel, Field + + +class Pikaffect(str, Enum): + Cake_ify = "Cake-ify" + Crumble = "Crumble" + Crush = "Crush" + Decapitate = "Decapitate" + Deflate = "Deflate" + Dissolve = "Dissolve" + Explode = "Explode" + Eye_pop = "Eye-pop" + Inflate = "Inflate" + Levitate = "Levitate" + Melt = "Melt" + Peel = "Peel" + Poke = "Poke" + Squish = "Squish" + Ta_da = "Ta-da" + Tear = "Tear" + + +class PikaBodyGenerate22C2vGenerate22PikascenesPost(BaseModel): + aspectRatio: Optional[float] = Field(None, description='Aspect ratio (width / height)') + duration: Optional[int] = Field(5) + ingredientsMode: str = Field(...) + negativePrompt: Optional[str] = Field(None) + promptText: Optional[str] = Field(None) + resolution: Optional[str] = Field('1080p') + seed: Optional[int] = Field(None) + + +class PikaGenerateResponse(BaseModel): + video_id: str = Field(...) + + +class PikaBodyGenerate22I2vGenerate22I2vPost(BaseModel): + duration: Optional[int] = 5 + negativePrompt: Optional[str] = Field(None) + promptText: Optional[str] = Field(None) + resolution: Optional[str] = '1080p' + seed: Optional[int] = Field(None) + + +class PikaBodyGenerate22KeyframeGenerate22PikaframesPost(BaseModel): + duration: Optional[int] = Field(None, ge=5, le=10) + negativePrompt: Optional[str] = Field(None) + promptText: str = Field(...) + resolution: Optional[str] = '1080p' + seed: Optional[int] = Field(None) + + +class PikaBodyGenerate22T2vGenerate22T2vPost(BaseModel): + aspectRatio: Optional[float] = Field( + 1.7777777777777777, + description='Aspect ratio (width / height)', + ge=0.4, + le=2.5, + ) + duration: Optional[int] = 5 + negativePrompt: Optional[str] = Field(None) + promptText: str = Field(...) + resolution: Optional[str] = '1080p' + seed: Optional[int] = Field(None) + + +class PikaBodyGeneratePikadditionsGeneratePikadditionsPost(BaseModel): + negativePrompt: Optional[str] = Field(None) + promptText: Optional[str] = Field(None) + seed: Optional[int] = Field(None) + + +class PikaBodyGeneratePikaffectsGeneratePikaffectsPost(BaseModel): + negativePrompt: Optional[str] = Field(None) + pikaffect: Optional[str] = None + promptText: Optional[str] = Field(None) + seed: Optional[int] = Field(None) + + +class PikaBodyGeneratePikaswapsGeneratePikaswapsPost(BaseModel): + negativePrompt: Optional[str] = Field(None) + promptText: Optional[str] = Field(None) + seed: Optional[int] = Field(None) + modifyRegionRoi: Optional[str] = Field(None) + + +class PikaStatusEnum(str, Enum): + queued = "queued" + started = "started" + finished = "finished" + failed = "failed" + + +class PikaVideoResponse(BaseModel): + id: str = Field(...) + progress: Optional[int] = Field(None) + status: PikaStatusEnum + url: Optional[str] = Field(None) diff --git a/comfy_api_nodes/nodes_pika.py b/comfy_api_nodes/nodes_pika.py index 35d6baf1c..10f11666d 100644 --- a/comfy_api_nodes/nodes_pika.py +++ b/comfy_api_nodes/nodes_pika.py @@ -8,30 +8,17 @@ from __future__ import annotations from io import BytesIO import logging from typing import Optional, TypeVar -from enum import Enum -import numpy as np import torch from typing_extensions import override -from comfy_api.latest import ComfyExtension, io as comfy_io -from comfy_api.input_impl import VideoFromFile +from comfy_api.latest import ComfyExtension, comfy_io from comfy_api.input_impl.video_types import VideoCodec, VideoContainer, VideoInput from comfy_api_nodes.apinode_utils import ( download_url_to_video_output, tensor_to_bytesio, ) -from comfy_api_nodes.apis import ( - PikaBodyGenerate22C2vGenerate22PikascenesPost, - PikaBodyGenerate22I2vGenerate22I2vPost, - PikaBodyGenerate22KeyframeGenerate22PikaframesPost, - PikaBodyGenerate22T2vGenerate22T2vPost, - PikaBodyGeneratePikadditionsGeneratePikadditionsPost, - PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - PikaBodyGeneratePikaswapsGeneratePikaswapsPost, - PikaGenerateResponse, - PikaVideoResponse, -) +from comfy_api_nodes.apis import pika_defs from comfy_api_nodes.apis.client import ( ApiEndpoint, EmptyRequest, @@ -55,116 +42,36 @@ PATH_PIKASCENES = f"/proxy/pika/generate/{PIKA_API_VERSION}/pikascenes" PATH_VIDEO_GET = "/proxy/pika/videos" -class PikaDurationEnum(int, Enum): - integer_5 = 5 - integer_10 = 10 - - -class PikaResolutionEnum(str, Enum): - field_1080p = "1080p" - field_720p = "720p" - - -class Pikaffect(str, Enum): - Cake_ify = "Cake-ify" - Crumble = "Crumble" - Crush = "Crush" - Decapitate = "Decapitate" - Deflate = "Deflate" - Dissolve = "Dissolve" - Explode = "Explode" - Eye_pop = "Eye-pop" - Inflate = "Inflate" - Levitate = "Levitate" - Melt = "Melt" - Peel = "Peel" - Poke = "Poke" - Squish = "Squish" - Ta_da = "Ta-da" - Tear = "Tear" - - -class PikaApiError(Exception): - """Exception for Pika API errors.""" - - pass - - -def is_valid_video_response(response: PikaVideoResponse) -> bool: - """Check if the video response is valid.""" - return hasattr(response, "url") and response.url is not None - - -def is_valid_initial_response(response: PikaGenerateResponse) -> bool: - """Check if the initial response is valid.""" - return hasattr(response, "video_id") and response.video_id is not None - - -async def poll_for_task_status( - task_id: str, +async def execute_task( + initial_operation: SynchronousOperation[R, pika_defs.PikaGenerateResponse], auth_kwargs: Optional[dict[str, str]] = None, node_id: Optional[str] = None, -) -> PikaGenerateResponse: - polling_operation = PollingOperation( +) -> comfy_io.NodeOutput: + task_id = (await initial_operation.execute()).video_id + final_response: pika_defs.PikaVideoResponse = await PollingOperation( poll_endpoint=ApiEndpoint( path=f"{PATH_VIDEO_GET}/{task_id}", method=HttpMethod.GET, request_model=EmptyRequest, - response_model=PikaVideoResponse, + response_model=pika_defs.PikaVideoResponse, ), - completed_statuses=[ - "finished", - ], + completed_statuses=["finished"], failed_statuses=["failed", "cancelled"], - status_extractor=lambda response: ( - response.status.value if response.status else None - ), - progress_extractor=lambda response: ( - response.progress if hasattr(response, "progress") else None - ), + status_extractor=lambda response: (response.status.value if response.status else None), + progress_extractor=lambda response: (response.progress if hasattr(response, "progress") else None), auth_kwargs=auth_kwargs, - result_url_extractor=lambda response: ( - response.url if hasattr(response, "url") else None - ), + result_url_extractor=lambda response: (response.url if hasattr(response, "url") else None), node_id=node_id, - estimated_duration=60 - ) - return await polling_operation.execute() - - -async def execute_task( - initial_operation: SynchronousOperation[R, PikaGenerateResponse], - auth_kwargs: Optional[dict[str, str]] = None, - node_id: Optional[str] = None, -) -> tuple[VideoFromFile]: - """Executes the initial operation then polls for the task status until it is completed. - - Args: - initial_operation: The initial operation to execute. - auth_kwargs: The authentication token(s) to use for the API call. - - Returns: - A tuple containing the video file as a VIDEO output. - """ - initial_response = await initial_operation.execute() - if not is_valid_initial_response(initial_response): - error_msg = f"Pika initial request failed. Code: {initial_response.code}, Message: {initial_response.message}, Data: {initial_response.data}" + estimated_duration=60, + max_poll_attempts=240, + ).execute() + if not final_response.url: + error_msg = f"Pika task {task_id} succeeded but no video data found in response:\n{final_response}" logging.error(error_msg) - raise PikaApiError(error_msg) - - task_id = initial_response.video_id - final_response = await poll_for_task_status(task_id, auth_kwargs, node_id=node_id) - if not is_valid_video_response(final_response): - error_msg = ( - f"Pika task {task_id} succeeded but no video data found in response." - ) - logging.error(error_msg) - raise PikaApiError(error_msg) - - video_url = str(final_response.url) + raise Exception(error_msg) + video_url = final_response.url logging.info("Pika task %s succeeded. Video URL: %s", task_id, video_url) - - return (await download_url_to_video_output(video_url),) + return comfy_io.NodeOutput(await download_url_to_video_output(video_url)) def get_base_inputs_types() -> list[comfy_io.Input]: @@ -173,16 +80,12 @@ def get_base_inputs_types() -> list[comfy_io.Input]: comfy_io.String.Input("prompt_text", multiline=True), comfy_io.String.Input("negative_prompt", multiline=True), comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), - comfy_io.Combo.Input( - "resolution", options=PikaResolutionEnum, default=PikaResolutionEnum.field_1080p - ), - comfy_io.Combo.Input( - "duration", options=PikaDurationEnum, default=PikaDurationEnum.integer_5 - ), + comfy_io.Combo.Input("resolution", options=["1080p", "720p"], default="1080p"), + comfy_io.Combo.Input("duration", options=[5, 10], default=5), ] -class PikaImageToVideoV2_2(comfy_io.ComfyNode): +class PikaImageToVideo(comfy_io.ComfyNode): """Pika 2.2 Image to Video Node.""" @classmethod @@ -215,14 +118,9 @@ class PikaImageToVideoV2_2(comfy_io.ComfyNode): resolution: str, duration: int, ) -> comfy_io.NodeOutput: - # Convert image to BytesIO image_bytes_io = tensor_to_bytesio(image) - image_bytes_io.seek(0) - pika_files = {"image": ("image.png", image_bytes_io, "image/png")} - - # Prepare non-file data - pika_request_data = PikaBodyGenerate22I2vGenerate22I2vPost( + pika_request_data = pika_defs.PikaBodyGenerate22I2vGenerate22I2vPost( promptText=prompt_text, negativePrompt=negative_prompt, seed=seed, @@ -237,8 +135,8 @@ class PikaImageToVideoV2_2(comfy_io.ComfyNode): endpoint=ApiEndpoint( path=PATH_IMAGE_TO_VIDEO, method=HttpMethod.POST, - request_model=PikaBodyGenerate22I2vGenerate22I2vPost, - response_model=PikaGenerateResponse, + request_model=pika_defs.PikaBodyGenerate22I2vGenerate22I2vPost, + response_model=pika_defs.PikaGenerateResponse, ), request=pika_request_data, files=pika_files, @@ -248,7 +146,7 @@ class PikaImageToVideoV2_2(comfy_io.ComfyNode): return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaTextToVideoNodeV2_2(comfy_io.ComfyNode): +class PikaTextToVideoNode(comfy_io.ComfyNode): """Pika Text2Video v2.2 Node.""" @classmethod @@ -296,10 +194,10 @@ class PikaTextToVideoNodeV2_2(comfy_io.ComfyNode): endpoint=ApiEndpoint( path=PATH_TEXT_TO_VIDEO, method=HttpMethod.POST, - request_model=PikaBodyGenerate22T2vGenerate22T2vPost, - response_model=PikaGenerateResponse, + request_model=pika_defs.PikaBodyGenerate22T2vGenerate22T2vPost, + response_model=pika_defs.PikaGenerateResponse, ), - request=PikaBodyGenerate22T2vGenerate22T2vPost( + request=pika_defs.PikaBodyGenerate22T2vGenerate22T2vPost( promptText=prompt_text, negativePrompt=negative_prompt, seed=seed, @@ -313,7 +211,7 @@ class PikaTextToVideoNodeV2_2(comfy_io.ComfyNode): return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaScenesV2_2(comfy_io.ComfyNode): +class PikaScenes(comfy_io.ComfyNode): """PikaScenes v2.2 Node.""" @classmethod @@ -389,7 +287,6 @@ class PikaScenesV2_2(comfy_io.ComfyNode): image_ingredient_4: Optional[torch.Tensor] = None, image_ingredient_5: Optional[torch.Tensor] = None, ) -> comfy_io.NodeOutput: - # Convert all passed images to BytesIO all_image_bytes_io = [] for image in [ image_ingredient_1, @@ -399,16 +296,14 @@ class PikaScenesV2_2(comfy_io.ComfyNode): image_ingredient_5, ]: if image is not None: - image_bytes_io = tensor_to_bytesio(image) - image_bytes_io.seek(0) - all_image_bytes_io.append(image_bytes_io) + all_image_bytes_io.append(tensor_to_bytesio(image)) pika_files = [ ("images", (f"image_{i}.png", image_bytes_io, "image/png")) for i, image_bytes_io in enumerate(all_image_bytes_io) ] - pika_request_data = PikaBodyGenerate22C2vGenerate22PikascenesPost( + pika_request_data = pika_defs.PikaBodyGenerate22C2vGenerate22PikascenesPost( ingredientsMode=ingredients_mode, promptText=prompt_text, negativePrompt=negative_prompt, @@ -425,8 +320,8 @@ class PikaScenesV2_2(comfy_io.ComfyNode): endpoint=ApiEndpoint( path=PATH_PIKASCENES, method=HttpMethod.POST, - request_model=PikaBodyGenerate22C2vGenerate22PikascenesPost, - response_model=PikaGenerateResponse, + request_model=pika_defs.PikaBodyGenerate22C2vGenerate22PikascenesPost, + response_model=pika_defs.PikaGenerateResponse, ), request=pika_request_data, files=pika_files, @@ -477,22 +372,16 @@ class PikAdditionsNode(comfy_io.ComfyNode): negative_prompt: str, seed: int, ) -> comfy_io.NodeOutput: - # Convert video to BytesIO video_bytes_io = BytesIO() video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264) video_bytes_io.seek(0) - # Convert image to BytesIO image_bytes_io = tensor_to_bytesio(image) - image_bytes_io.seek(0) - pika_files = { "video": ("video.mp4", video_bytes_io, "video/mp4"), "image": ("image.png", image_bytes_io, "image/png"), } - - # Prepare non-file data - pika_request_data = PikaBodyGeneratePikadditionsGeneratePikadditionsPost( + pika_request_data = pika_defs.PikaBodyGeneratePikadditionsGeneratePikadditionsPost( promptText=prompt_text, negativePrompt=negative_prompt, seed=seed, @@ -505,8 +394,8 @@ class PikAdditionsNode(comfy_io.ComfyNode): endpoint=ApiEndpoint( path=PATH_PIKADDITIONS, method=HttpMethod.POST, - request_model=PikaBodyGeneratePikadditionsGeneratePikadditionsPost, - response_model=PikaGenerateResponse, + request_model=pika_defs.PikaBodyGeneratePikadditionsGeneratePikadditionsPost, + response_model=pika_defs.PikaGenerateResponse, ), request=pika_request_data, files=pika_files, @@ -529,11 +418,25 @@ class PikaSwapsNode(comfy_io.ComfyNode): category="api node/video/Pika", inputs=[ comfy_io.Video.Input("video", tooltip="The video to swap an object in."), - comfy_io.Image.Input("image", tooltip="The image used to replace the masked object in the video."), - comfy_io.Mask.Input("mask", tooltip="Use the mask to define areas in the video to replace"), - comfy_io.String.Input("prompt_text", multiline=True), - comfy_io.String.Input("negative_prompt", multiline=True), - comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), + comfy_io.Image.Input( + "image", + tooltip="The image used to replace the masked object in the video.", + optional=True, + ), + comfy_io.Mask.Input( + "mask", + tooltip="Use the mask to define areas in the video to replace.", + optional=True, + ), + comfy_io.String.Input("prompt_text", multiline=True, optional=True), + comfy_io.String.Input("negative_prompt", multiline=True, optional=True), + comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True, optional=True), + comfy_io.String.Input( + "region_to_modify", + multiline=True, + optional=True, + tooltip="Plaintext description of the object / region to modify.", + ), ], outputs=[comfy_io.Video.Output()], hidden=[ @@ -548,41 +451,29 @@ class PikaSwapsNode(comfy_io.ComfyNode): async def execute( cls, video: VideoInput, - image: torch.Tensor, - mask: torch.Tensor, - prompt_text: str, - negative_prompt: str, - seed: int, + image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, + prompt_text: str = "", + negative_prompt: str = "", + seed: int = 0, + region_to_modify: str = "", ) -> comfy_io.NodeOutput: - # Convert video to BytesIO video_bytes_io = BytesIO() video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264) video_bytes_io.seek(0) - - # Convert mask to binary mask with three channels - mask = torch.round(mask) - mask = mask.repeat(1, 3, 1, 1) - - # Convert 3-channel binary mask to BytesIO - mask_bytes_io = BytesIO() - mask_bytes_io.write(mask.numpy().astype(np.uint8)) - mask_bytes_io.seek(0) - - # Convert image to BytesIO - image_bytes_io = tensor_to_bytesio(image) - image_bytes_io.seek(0) - pika_files = { "video": ("video.mp4", video_bytes_io, "video/mp4"), - "image": ("image.png", image_bytes_io, "image/png"), - "modifyRegionMask": ("mask.png", mask_bytes_io, "image/png"), } + if mask is not None: + pika_files["modifyRegionMask"] = ("mask.png", tensor_to_bytesio(mask), "image/png") + if image is not None: + pika_files["image"] = ("image.png", tensor_to_bytesio(image), "image/png") - # Prepare non-file data - pika_request_data = PikaBodyGeneratePikaswapsGeneratePikaswapsPost( + pika_request_data = pika_defs.PikaBodyGeneratePikaswapsGeneratePikaswapsPost( promptText=prompt_text, negativePrompt=negative_prompt, seed=seed, + modifyRegionRoi=region_to_modify if region_to_modify else None, ) auth = { "auth_token": cls.hidden.auth_token_comfy_org, @@ -590,10 +481,10 @@ class PikaSwapsNode(comfy_io.ComfyNode): } initial_operation = SynchronousOperation( endpoint=ApiEndpoint( - path=PATH_PIKADDITIONS, + path=PATH_PIKASWAPS, method=HttpMethod.POST, - request_model=PikaBodyGeneratePikadditionsGeneratePikadditionsPost, - response_model=PikaGenerateResponse, + request_model=pika_defs.PikaBodyGeneratePikaswapsGeneratePikaswapsPost, + response_model=pika_defs.PikaGenerateResponse, ), request=pika_request_data, files=pika_files, @@ -616,7 +507,7 @@ class PikaffectsNode(comfy_io.ComfyNode): inputs=[ comfy_io.Image.Input("image", tooltip="The reference image to apply the Pikaffect to."), comfy_io.Combo.Input( - "pikaffect", options=Pikaffect, default="Cake-ify" + "pikaffect", options=pika_defs.Pikaffect, default="Cake-ify" ), comfy_io.String.Input("prompt_text", multiline=True), comfy_io.String.Input("negative_prompt", multiline=True), @@ -648,10 +539,10 @@ class PikaffectsNode(comfy_io.ComfyNode): endpoint=ApiEndpoint( path=PATH_PIKAFFECTS, method=HttpMethod.POST, - request_model=PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - response_model=PikaGenerateResponse, + request_model=pika_defs.PikaBodyGeneratePikaffectsGeneratePikaffectsPost, + response_model=pika_defs.PikaGenerateResponse, ), - request=PikaBodyGeneratePikaffectsGeneratePikaffectsPost( + request=pika_defs.PikaBodyGeneratePikaffectsGeneratePikaffectsPost( pikaffect=pikaffect, promptText=prompt_text, negativePrompt=negative_prompt, @@ -664,7 +555,7 @@ class PikaffectsNode(comfy_io.ComfyNode): return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaStartEndFrameNode2_2(comfy_io.ComfyNode): +class PikaStartEndFrameNode(comfy_io.ComfyNode): """PikaFrames v2.2 Node.""" @classmethod @@ -711,10 +602,10 @@ class PikaStartEndFrameNode2_2(comfy_io.ComfyNode): endpoint=ApiEndpoint( path=PATH_PIKAFRAMES, method=HttpMethod.POST, - request_model=PikaBodyGenerate22KeyframeGenerate22PikaframesPost, - response_model=PikaGenerateResponse, + request_model=pika_defs.PikaBodyGenerate22KeyframeGenerate22PikaframesPost, + response_model=pika_defs.PikaGenerateResponse, ), - request=PikaBodyGenerate22KeyframeGenerate22PikaframesPost( + request=pika_defs.PikaBodyGenerate22KeyframeGenerate22PikaframesPost( promptText=prompt_text, negativePrompt=negative_prompt, seed=seed, @@ -732,13 +623,13 @@ class PikaApiNodesExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: return [ - PikaImageToVideoV2_2, - PikaTextToVideoNodeV2_2, - PikaScenesV2_2, + PikaImageToVideo, + PikaTextToVideoNode, + PikaScenes, PikAdditionsNode, PikaSwapsNode, PikaffectsNode, - PikaStartEndFrameNode2_2, + PikaStartEndFrameNode, ] From fc0fbf141c7deb444fe730af2f2db8e2beddaf60 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 10 Oct 2025 01:18:23 +0300 Subject: [PATCH 428/759] convert nodes_sd3.py and nodes_slg.py to V3 schema (#10162) --- comfy_extras/nodes_sd3.py | 232 +++++++++++++++++++++++++------------- comfy_extras/nodes_slg.py | 108 +++++++++++------- 2 files changed, 219 insertions(+), 121 deletions(-) diff --git a/comfy_extras/nodes_sd3.py b/comfy_extras/nodes_sd3.py index d75b29e60..14782cb2b 100644 --- a/comfy_extras/nodes_sd3.py +++ b/comfy_extras/nodes_sd3.py @@ -3,64 +3,83 @@ import comfy.sd import comfy.model_management import nodes import torch -import comfy_extras.nodes_slg +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io +from comfy_extras.nodes_slg import SkipLayerGuidanceDiT -class TripleCLIPLoader: +class TripleCLIPLoader(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ), "clip_name2": (folder_paths.get_filename_list("text_encoders"), ), "clip_name3": (folder_paths.get_filename_list("text_encoders"), ) - }} - RETURN_TYPES = ("CLIP",) - FUNCTION = "load_clip" + def define_schema(cls): + return io.Schema( + node_id="TripleCLIPLoader", + category="advanced/loaders", + description="[Recipes]\n\nsd3: clip-l, clip-g, t5", + inputs=[ + io.Combo.Input("clip_name1", options=folder_paths.get_filename_list("text_encoders")), + io.Combo.Input("clip_name2", options=folder_paths.get_filename_list("text_encoders")), + io.Combo.Input("clip_name3", options=folder_paths.get_filename_list("text_encoders")), + ], + outputs=[ + io.Clip.Output(), + ], + ) - CATEGORY = "advanced/loaders" - - DESCRIPTION = "[Recipes]\n\nsd3: clip-l, clip-g, t5" - - def load_clip(self, clip_name1, clip_name2, clip_name3): + @classmethod + def execute(cls, clip_name1, clip_name2, clip_name3) -> io.NodeOutput: clip_path1 = folder_paths.get_full_path_or_raise("text_encoders", clip_name1) clip_path2 = folder_paths.get_full_path_or_raise("text_encoders", clip_name2) clip_path3 = folder_paths.get_full_path_or_raise("text_encoders", clip_name3) clip = comfy.sd.load_clip(ckpt_paths=[clip_path1, clip_path2, clip_path3], embedding_directory=folder_paths.get_folder_paths("embeddings")) - return (clip,) + return io.NodeOutput(clip) + + load_clip = execute # TODO: remove -class EmptySD3LatentImage: - def __init__(self): - self.device = comfy.model_management.intermediate_device() +class EmptySD3LatentImage(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="EmptySD3LatentImage", + category="latent/sd3", + inputs=[ + io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(), + ], + ) @classmethod - def INPUT_TYPES(s): - return {"required": { "width": ("INT", {"default": 1024, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 1024, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" + def execute(cls, width, height, batch_size=1) -> io.NodeOutput: + latent = torch.zeros([batch_size, 16, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + return io.NodeOutput({"samples":latent}) - CATEGORY = "latent/sd3" - - def generate(self, width, height, batch_size=1): - latent = torch.zeros([batch_size, 16, height // 8, width // 8], device=self.device) - return ({"samples":latent}, ) + generate = execute # TODO: remove -class CLIPTextEncodeSD3: +class CLIPTextEncodeSD3(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "clip_l": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "clip_g": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "t5xxl": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "empty_padding": (["none", "empty_prompt"], ) - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodeSD3", + category="advanced/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("clip_l", multiline=True, dynamic_prompts=True), + io.String.Input("clip_g", multiline=True, dynamic_prompts=True), + io.String.Input("t5xxl", multiline=True, dynamic_prompts=True), + io.Combo.Input("empty_padding", options=["none", "empty_prompt"]), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - CATEGORY = "advanced/conditioning" - - def encode(self, clip, clip_l, clip_g, t5xxl, empty_padding): + @classmethod + def execute(cls, clip, clip_l, clip_g, t5xxl, empty_padding) -> io.NodeOutput: no_padding = empty_padding == "none" tokens = clip.tokenize(clip_g) @@ -82,57 +101,112 @@ class CLIPTextEncodeSD3: tokens["l"] += empty["l"] while len(tokens["l"]) > len(tokens["g"]): tokens["g"] += empty["g"] - return (clip.encode_from_tokens_scheduled(tokens), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens)) + + encode = execute # TODO: remove -class ControlNetApplySD3(nodes.ControlNetApplyAdvanced): +class ControlNetApplySD3(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "control_net": ("CONTROL_NET", ), - "vae": ("VAE", ), - "image": ("IMAGE", ), - "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001}) - }} - CATEGORY = "conditioning/controlnet" - DEPRECATED = True + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="ControlNetApplySD3", + display_name="Apply Controlnet with VAE", + category="conditioning/controlnet", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.ControlNet.Input("control_net"), + io.Vae.Input("vae"), + io.Image.Input("image"), + io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=1.0, min=0.0, max=1.0, step=0.001), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + ], + is_deprecated=True, + ) + + @classmethod + def execute(cls, positive, negative, control_net, image, strength, start_percent, end_percent, vae=None) -> io.NodeOutput: + if strength == 0: + return io.NodeOutput(positive, negative) + + control_hint = image.movedim(-1, 1) + cnets = {} + + out = [] + for conditioning in [positive, negative]: + c = [] + for t in conditioning: + d = t[1].copy() + + prev_cnet = d.get('control', None) + if prev_cnet in cnets: + c_net = cnets[prev_cnet] + else: + c_net = control_net.copy().set_cond_hint(control_hint, strength, (start_percent, end_percent), + vae=vae, extra_concat=[]) + c_net.set_previous_controlnet(prev_cnet) + cnets[prev_cnet] = c_net + + d['control'] = c_net + d['control_apply_to_uncond'] = False + n = [t[0], d] + c.append(n) + out.append(c) + return io.NodeOutput(out[0], out[1]) + + apply_controlnet = execute # TODO: remove -class SkipLayerGuidanceSD3(comfy_extras.nodes_slg.SkipLayerGuidanceDiT): +class SkipLayerGuidanceSD3(io.ComfyNode): ''' Enhance guidance towards detailed dtructure by having another set of CFG negative with skipped layers. Inspired by Perturbed Attention Guidance (https://arxiv.org/abs/2403.17377) Experimental implementation by Dango233@StabilityAI. ''' + @classmethod - def INPUT_TYPES(s): - return {"required": {"model": ("MODEL", ), - "layers": ("STRING", {"default": "7, 8, 9", "multiline": False}), - "scale": ("FLOAT", {"default": 3.0, "min": 0.0, "max": 10.0, "step": 0.1}), - "start_percent": ("FLOAT", {"default": 0.01, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 0.15, "min": 0.0, "max": 1.0, "step": 0.001}) - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "skip_guidance_sd3" + def define_schema(cls): + return io.Schema( + node_id="SkipLayerGuidanceSD3", + category="advanced/guidance", + description="Generic version of SkipLayerGuidance node that can be used on every DiT model.", + inputs=[ + io.Model.Input("model"), + io.String.Input("layers", default="7, 8, 9", multiline=False), + io.Float.Input("scale", default=3.0, min=0.0, max=10.0, step=0.1), + io.Float.Input("start_percent", default=0.01, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=0.15, min=0.0, max=1.0, step=0.001), + ], + outputs=[ + io.Model.Output(), + ], + is_experimental=True, + ) - CATEGORY = "advanced/guidance" + @classmethod + def execute(cls, model, layers, scale, start_percent, end_percent) -> io.NodeOutput: + return SkipLayerGuidanceDiT().execute(model=model, scale=scale, start_percent=start_percent, end_percent=end_percent, double_layers=layers) - def skip_guidance_sd3(self, model, layers, scale, start_percent, end_percent): - return self.skip_guidance(model=model, scale=scale, start_percent=start_percent, end_percent=end_percent, double_layers=layers) + skip_guidance_sd3 = execute # TODO: remove -NODE_CLASS_MAPPINGS = { - "TripleCLIPLoader": TripleCLIPLoader, - "EmptySD3LatentImage": EmptySD3LatentImage, - "CLIPTextEncodeSD3": CLIPTextEncodeSD3, - "ControlNetApplySD3": ControlNetApplySD3, - "SkipLayerGuidanceSD3": SkipLayerGuidanceSD3, -} +class SD3Extension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TripleCLIPLoader, + EmptySD3LatentImage, + CLIPTextEncodeSD3, + ControlNetApplySD3, + SkipLayerGuidanceSD3, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - # Sampling - "ControlNetApplySD3": "Apply Controlnet with VAE", -} + +async def comfy_entrypoint() -> SD3Extension: + return SD3Extension() diff --git a/comfy_extras/nodes_slg.py b/comfy_extras/nodes_slg.py index 7adff202e..f462faa8f 100644 --- a/comfy_extras/nodes_slg.py +++ b/comfy_extras/nodes_slg.py @@ -1,33 +1,40 @@ import comfy.model_patcher import comfy.samplers import re +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class SkipLayerGuidanceDiT: +class SkipLayerGuidanceDiT(io.ComfyNode): ''' Enhance guidance towards detailed dtructure by having another set of CFG negative with skipped layers. Inspired by Perturbed Attention Guidance (https://arxiv.org/abs/2403.17377) Original experimental implementation for SD3 by Dango233@StabilityAI. ''' + @classmethod - def INPUT_TYPES(s): - return {"required": {"model": ("MODEL", ), - "double_layers": ("STRING", {"default": "7, 8, 9", "multiline": False}), - "single_layers": ("STRING", {"default": "7, 8, 9", "multiline": False}), - "scale": ("FLOAT", {"default": 3.0, "min": 0.0, "max": 10.0, "step": 0.1}), - "start_percent": ("FLOAT", {"default": 0.01, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 0.15, "min": 0.0, "max": 1.0, "step": 0.001}), - "rescaling_scale": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "skip_guidance" - EXPERIMENTAL = True + def define_schema(cls): + return io.Schema( + node_id="SkipLayerGuidanceDiT", + category="advanced/guidance", + description="Generic version of SkipLayerGuidance node that can be used on every DiT model.", + is_experimental=True, + inputs=[ + io.Model.Input("model"), + io.String.Input("double_layers", default="7, 8, 9"), + io.String.Input("single_layers", default="7, 8, 9"), + io.Float.Input("scale", default=3.0, min=0.0, max=10.0, step=0.1), + io.Float.Input("start_percent", default=0.01, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=0.15, min=0.0, max=1.0, step=0.001), + io.Float.Input("rescaling_scale", default=0.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[ + io.Model.Output(), + ], + ) - DESCRIPTION = "Generic version of SkipLayerGuidance node that can be used on every DiT model." - - CATEGORY = "advanced/guidance" - - def skip_guidance(self, model, scale, start_percent, end_percent, double_layers="", single_layers="", rescaling_scale=0): + @classmethod + def execute(cls, model, scale, start_percent, end_percent, double_layers="", single_layers="", rescaling_scale=0) -> io.NodeOutput: # check if layer is comma separated integers def skip(args, extra_args): return args @@ -43,7 +50,7 @@ class SkipLayerGuidanceDiT: single_layers = [int(i) for i in single_layers] if len(double_layers) == 0 and len(single_layers) == 0: - return (model, ) + return io.NodeOutput(model) def post_cfg_function(args): model = args["model"] @@ -76,29 +83,36 @@ class SkipLayerGuidanceDiT: m = model.clone() m.set_model_sampler_post_cfg_function(post_cfg_function) - return (m, ) + return io.NodeOutput(m) -class SkipLayerGuidanceDiTSimple: + skip_guidance = execute # TODO: remove + + +class SkipLayerGuidanceDiTSimple(io.ComfyNode): ''' Simple version of the SkipLayerGuidanceDiT node that only modifies the uncond pass. ''' @classmethod - def INPUT_TYPES(s): - return {"required": {"model": ("MODEL", ), - "double_layers": ("STRING", {"default": "7, 8, 9", "multiline": False}), - "single_layers": ("STRING", {"default": "7, 8, 9", "multiline": False}), - "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "skip_guidance" - EXPERIMENTAL = True + def define_schema(cls): + return io.Schema( + node_id="SkipLayerGuidanceDiTSimple", + category="advanced/guidance", + description="Simple version of the SkipLayerGuidanceDiT node that only modifies the uncond pass.", + is_experimental=True, + inputs=[ + io.Model.Input("model"), + io.String.Input("double_layers", default="7, 8, 9"), + io.String.Input("single_layers", default="7, 8, 9"), + io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=1.0, min=0.0, max=1.0, step=0.001), + ], + outputs=[ + io.Model.Output(), + ], + ) - DESCRIPTION = "Simple version of the SkipLayerGuidanceDiT node that only modifies the uncond pass." - - CATEGORY = "advanced/guidance" - - def skip_guidance(self, model, start_percent, end_percent, double_layers="", single_layers=""): + @classmethod + def execute(cls, model, start_percent, end_percent, double_layers="", single_layers="") -> io.NodeOutput: def skip(args, extra_args): return args @@ -113,7 +127,7 @@ class SkipLayerGuidanceDiTSimple: single_layers = [int(i) for i in single_layers] if len(double_layers) == 0 and len(single_layers) == 0: - return (model, ) + return io.NodeOutput(model) def calc_cond_batch_function(args): x = args["input"] @@ -144,9 +158,19 @@ class SkipLayerGuidanceDiTSimple: m = model.clone() m.set_model_sampler_calc_cond_batch_function(calc_cond_batch_function) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "SkipLayerGuidanceDiT": SkipLayerGuidanceDiT, - "SkipLayerGuidanceDiTSimple": SkipLayerGuidanceDiTSimple, -} + skip_guidance = execute # TODO: remove + + +class SkipLayerGuidanceExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + SkipLayerGuidanceDiT, + SkipLayerGuidanceDiTSimple, + ] + + +async def comfy_entrypoint() -> SkipLayerGuidanceExtension: + return SkipLayerGuidanceExtension() From f1dd6e50f891b1d2b17e4b8d26d422634fe49595 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 9 Oct 2025 16:02:40 -0700 Subject: [PATCH 429/759] Fix bug with applying loras on fp8 scaled without fp8 ops. (#10279) --- comfy/model_patcher.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index e8c859689..c0b68fb8c 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -130,17 +130,21 @@ class LowVramPatch: self.set_func = set_func def __call__(self, weight): + intermediate_dtype = weight.dtype if self.convert_func is not None: weight = self.convert_func(weight.to(dtype=torch.float32, copy=True), inplace=True) - intermediate_dtype = weight.dtype - if self.set_func is None and intermediate_dtype not in [torch.float32, torch.float16, torch.bfloat16]: #intermediate_dtype has to be one that is supported in math ops + if intermediate_dtype not in [torch.float32, torch.float16, torch.bfloat16]: #intermediate_dtype has to be one that is supported in math ops intermediate_dtype = torch.float32 - return comfy.float.stochastic_rounding(comfy.lora.calculate_weight(self.patches[self.key], weight.to(intermediate_dtype), self.key, intermediate_dtype=intermediate_dtype), weight.dtype, seed=string_to_seed(self.key)) + out = comfy.lora.calculate_weight(self.patches[self.key], weight.to(intermediate_dtype), self.key, intermediate_dtype=intermediate_dtype) + if self.set_func is None: + return comfy.float.stochastic_rounding(out, weight.dtype, seed=string_to_seed(self.key)) + else: + return self.set_func(out, seed=string_to_seed(self.key), return_weight=True) out = comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=intermediate_dtype) if self.set_func is not None: - return self.set_func(out, seed=string_to_seed(self.key), return_weight=True) + return self.set_func(out, seed=string_to_seed(self.key), return_weight=True).to(dtype=intermediate_dtype) else: return out From 90853fb9cd42ebbee7b3fcf46e518e5632912b11 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 10 Oct 2025 02:07:17 +0300 Subject: [PATCH 430/759] convert nodes_flux to V3 schema (#10122) --- comfy_extras/nodes_flux.py | 189 ++++++++++++++++++++++--------------- 1 file changed, 115 insertions(+), 74 deletions(-) diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index 25e029ffd..ce1b2e89f 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -1,60 +1,80 @@ import node_helpers import comfy.utils +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class CLIPTextEncodeFlux: + +class CLIPTextEncodeFlux(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "clip_l": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "t5xxl": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "guidance": ("FLOAT", {"default": 3.5, "min": 0.0, "max": 100.0, "step": 0.1}), - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodeFlux", + category="advanced/conditioning/flux", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("clip_l", multiline=True, dynamic_prompts=True), + io.String.Input("t5xxl", multiline=True, dynamic_prompts=True), + io.Float.Input("guidance", default=3.5, min=0.0, max=100.0, step=0.1), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - CATEGORY = "advanced/conditioning/flux" - - def encode(self, clip, clip_l, t5xxl, guidance): + @classmethod + def execute(cls, clip, clip_l, t5xxl, guidance) -> io.NodeOutput: tokens = clip.tokenize(clip_l) tokens["t5xxl"] = clip.tokenize(t5xxl)["t5xxl"] - return (clip.encode_from_tokens_scheduled(tokens, add_dict={"guidance": guidance}), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens, add_dict={"guidance": guidance})) -class FluxGuidance: + encode = execute # TODO: remove + + +class FluxGuidance(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "conditioning": ("CONDITIONING", ), - "guidance": ("FLOAT", {"default": 3.5, "min": 0.0, "max": 100.0, "step": 0.1}), - }} + def define_schema(cls): + return io.Schema( + node_id="FluxGuidance", + category="advanced/conditioning/flux", + inputs=[ + io.Conditioning.Input("conditioning"), + io.Float.Input("guidance", default=3.5, min=0.0, max=100.0, step=0.1), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "append" - - CATEGORY = "advanced/conditioning/flux" - - def append(self, conditioning, guidance): + @classmethod + def execute(cls, conditioning, guidance) -> io.NodeOutput: c = node_helpers.conditioning_set_values(conditioning, {"guidance": guidance}) - return (c, ) + return io.NodeOutput(c) + + append = execute # TODO: remove -class FluxDisableGuidance: +class FluxDisableGuidance(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "conditioning": ("CONDITIONING", ), - }} + def define_schema(cls): + return io.Schema( + node_id="FluxDisableGuidance", + category="advanced/conditioning/flux", + description="This node completely disables the guidance embed on Flux and Flux like models", + inputs=[ + io.Conditioning.Input("conditioning"), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "append" - - CATEGORY = "advanced/conditioning/flux" - DESCRIPTION = "This node completely disables the guidance embed on Flux and Flux like models" - - def append(self, conditioning): + @classmethod + def execute(cls, conditioning) -> io.NodeOutput: c = node_helpers.conditioning_set_values(conditioning, {"guidance": None}) - return (c, ) + return io.NodeOutput(c) + + append = execute # TODO: remove PREFERED_KONTEXT_RESOLUTIONS = [ @@ -78,52 +98,73 @@ PREFERED_KONTEXT_RESOLUTIONS = [ ] -class FluxKontextImageScale: +class FluxKontextImageScale(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"image": ("IMAGE", ), - }, - } + def define_schema(cls): + return io.Schema( + node_id="FluxKontextImageScale", + category="advanced/conditioning/flux", + description="This node resizes the image to one that is more optimal for flux kontext.", + inputs=[ + io.Image.Input("image"), + ], + outputs=[ + io.Image.Output(), + ], + ) - RETURN_TYPES = ("IMAGE",) - FUNCTION = "scale" - - CATEGORY = "advanced/conditioning/flux" - DESCRIPTION = "This node resizes the image to one that is more optimal for flux kontext." - - def scale(self, image): + @classmethod + def execute(cls, image) -> io.NodeOutput: width = image.shape[2] height = image.shape[1] aspect_ratio = width / height _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS) image = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "lanczos", "center").movedim(1, -1) - return (image, ) + return io.NodeOutput(image) + + scale = execute # TODO: remove -class FluxKontextMultiReferenceLatentMethod: +class FluxKontextMultiReferenceLatentMethod(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "conditioning": ("CONDITIONING", ), - "reference_latents_method": (("offset", "index", "uxo/uno"), ), - }} + def define_schema(cls): + return io.Schema( + node_id="FluxKontextMultiReferenceLatentMethod", + category="advanced/conditioning/flux", + inputs=[ + io.Conditioning.Input("conditioning"), + io.Combo.Input( + "reference_latents_method", + options=["offset", "index", "uxo/uno"], + ), + ], + outputs=[ + io.Conditioning.Output(), + ], + is_experimental=True, + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "append" - EXPERIMENTAL = True - - CATEGORY = "advanced/conditioning/flux" - - def append(self, conditioning, reference_latents_method): + @classmethod + def execute(cls, conditioning, reference_latents_method) -> io.NodeOutput: if "uxo" in reference_latents_method or "uso" in reference_latents_method: reference_latents_method = "uxo" c = node_helpers.conditioning_set_values(conditioning, {"reference_latents_method": reference_latents_method}) - return (c, ) + return io.NodeOutput(c) -NODE_CLASS_MAPPINGS = { - "CLIPTextEncodeFlux": CLIPTextEncodeFlux, - "FluxGuidance": FluxGuidance, - "FluxDisableGuidance": FluxDisableGuidance, - "FluxKontextImageScale": FluxKontextImageScale, - "FluxKontextMultiReferenceLatentMethod": FluxKontextMultiReferenceLatentMethod, -} + append = execute # TODO: remove + + +class FluxExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + CLIPTextEncodeFlux, + FluxGuidance, + FluxDisableGuidance, + FluxKontextImageScale, + FluxKontextMultiReferenceLatentMethod, + ] + + +async def comfy_entrypoint() -> FluxExtension: + return FluxExtension() From 81e4dac107c24b1655babc47c99c33551c96a644 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 10 Oct 2025 02:08:40 +0300 Subject: [PATCH 431/759] convert nodes_upscale_model.py to V3 schema (#10149) --- comfy_extras/nodes_upscale_model.py | 76 +++++++++++++++++++---------- nodes.py | 2 - 2 files changed, 51 insertions(+), 27 deletions(-) diff --git a/comfy_extras/nodes_upscale_model.py b/comfy_extras/nodes_upscale_model.py index 04c948341..4d62b87be 100644 --- a/comfy_extras/nodes_upscale_model.py +++ b/comfy_extras/nodes_upscale_model.py @@ -4,6 +4,8 @@ from comfy import model_management import torch import comfy.utils import folder_paths +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io try: from spandrel_extra_arches import EXTRA_REGISTRY @@ -13,17 +15,23 @@ try: except: pass -class UpscaleModelLoader: +class UpscaleModelLoader(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model_name": (folder_paths.get_filename_list("upscale_models"), ), - }} - RETURN_TYPES = ("UPSCALE_MODEL",) - FUNCTION = "load_model" + def define_schema(cls): + return io.Schema( + node_id="UpscaleModelLoader", + display_name="Load Upscale Model", + category="loaders", + inputs=[ + io.Combo.Input("model_name", options=folder_paths.get_filename_list("upscale_models")), + ], + outputs=[ + io.UpscaleModel.Output(), + ], + ) - CATEGORY = "loaders" - - def load_model(self, model_name): + @classmethod + def execute(cls, model_name) -> io.NodeOutput: model_path = folder_paths.get_full_path_or_raise("upscale_models", model_name) sd = comfy.utils.load_torch_file(model_path, safe_load=True) if "module.layers.0.residual_group.blocks.0.norm1.weight" in sd: @@ -33,21 +41,29 @@ class UpscaleModelLoader: if not isinstance(out, ImageModelDescriptor): raise Exception("Upscale model must be a single-image model.") - return (out, ) + return io.NodeOutput(out) + + load_model = execute # TODO: remove -class ImageUpscaleWithModel: +class ImageUpscaleWithModel(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "upscale_model": ("UPSCALE_MODEL",), - "image": ("IMAGE",), - }} - RETURN_TYPES = ("IMAGE",) - FUNCTION = "upscale" + def define_schema(cls): + return io.Schema( + node_id="ImageUpscaleWithModel", + display_name="Upscale Image (using Model)", + category="image/upscaling", + inputs=[ + io.UpscaleModel.Input("upscale_model"), + io.Image.Input("image"), + ], + outputs=[ + io.Image.Output(), + ], + ) - CATEGORY = "image/upscaling" - - def upscale(self, upscale_model, image): + @classmethod + def execute(cls, upscale_model, image) -> io.NodeOutput: device = model_management.get_torch_device() memory_required = model_management.module_size(upscale_model.model) @@ -75,9 +91,19 @@ class ImageUpscaleWithModel: upscale_model.to("cpu") s = torch.clamp(s.movedim(-3,-1), min=0, max=1.0) - return (s,) + return io.NodeOutput(s) -NODE_CLASS_MAPPINGS = { - "UpscaleModelLoader": UpscaleModelLoader, - "ImageUpscaleWithModel": ImageUpscaleWithModel -} + upscale = execute # TODO: remove + + +class UpscaleModelExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + UpscaleModelLoader, + ImageUpscaleWithModel, + ] + + +async def comfy_entrypoint() -> UpscaleModelExtension: + return UpscaleModelExtension() diff --git a/nodes.py b/nodes.py index 2a2a5f2ad..7cfa8ca14 100644 --- a/nodes.py +++ b/nodes.py @@ -2027,7 +2027,6 @@ NODE_DISPLAY_NAME_MAPPINGS = { "DiffControlNetLoader": "Load ControlNet Model (diff)", "StyleModelLoader": "Load Style Model", "CLIPVisionLoader": "Load CLIP Vision", - "UpscaleModelLoader": "Load Upscale Model", "UNETLoader": "Load Diffusion Model", # Conditioning "CLIPVisionEncode": "CLIP Vision Encode", @@ -2065,7 +2064,6 @@ NODE_DISPLAY_NAME_MAPPINGS = { "LoadImageOutput": "Load Image (from Outputs)", "ImageScale": "Upscale Image", "ImageScaleBy": "Upscale Image By", - "ImageUpscaleWithModel": "Upscale Image (using Model)", "ImageInvert": "Invert Image", "ImagePadForOutpaint": "Pad Image for Outpainting", "ImageBatch": "Batch Images", From cdfc25a1605add750a3b1a83360b84e8e95324c6 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 10 Oct 2025 14:33:51 -0700 Subject: [PATCH 432/759] Fix save audio nodes saving mono audio as stereo. (#10289) --- comfy_extras/nodes_audio.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 1c868fcba..2ed7e0b22 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -142,9 +142,10 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non for key, value in metadata.items(): output_container.metadata[key] = value + layout = 'mono' if waveform.shape[0] == 1 else 'stereo' # Set up the output stream with appropriate properties if format == "opus": - out_stream = output_container.add_stream("libopus", rate=sample_rate) + out_stream = output_container.add_stream("libopus", rate=sample_rate, layout=layout) if quality == "64k": out_stream.bit_rate = 64000 elif quality == "96k": @@ -156,7 +157,7 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non elif quality == "320k": out_stream.bit_rate = 320000 elif format == "mp3": - out_stream = output_container.add_stream("libmp3lame", rate=sample_rate) + out_stream = output_container.add_stream("libmp3lame", rate=sample_rate, layout=layout) if quality == "V0": #TODO i would really love to support V3 and V5 but there doesn't seem to be a way to set the qscale level, the property below is a bool out_stream.codec_context.qscale = 1 @@ -165,9 +166,9 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non elif quality == "320k": out_stream.bit_rate = 320000 else: #format == "flac": - out_stream = output_container.add_stream("flac", rate=sample_rate) + out_stream = output_container.add_stream("flac", rate=sample_rate, layout=layout) - frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[0] == 1 else 'stereo') + frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', layout=layout) frame.sample_rate = sample_rate frame.pts = 0 output_container.mux(out_stream.encode(frame)) From aa895db7e876401eb3b1d2601f49d6f2aee770ca Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 11 Oct 2025 02:17:20 +0300 Subject: [PATCH 433/759] feat(GeminiImage-ApiNode): add aspect_ratio and release version of model (#10255) --- comfy_api_nodes/apis/gemini_api.py | 17 ++++++++++------- comfy_api_nodes/nodes_gemini.py | 24 ++++++++++++++++++------ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/comfy_api_nodes/apis/gemini_api.py b/comfy_api_nodes/apis/gemini_api.py index 138bf035d..2bf28bf93 100644 --- a/comfy_api_nodes/apis/gemini_api.py +++ b/comfy_api_nodes/apis/gemini_api.py @@ -1,19 +1,22 @@ -from __future__ import annotations - -from typing import List, Optional +from typing import Optional from comfy_api_nodes.apis import GeminiGenerationConfig, GeminiContent, GeminiSafetySetting, GeminiSystemInstructionContent, GeminiTool, GeminiVideoMetadata from pydantic import BaseModel +class GeminiImageConfig(BaseModel): + aspectRatio: Optional[str] = None + + class GeminiImageGenerationConfig(GeminiGenerationConfig): - responseModalities: Optional[List[str]] = None + responseModalities: Optional[list[str]] = None + imageConfig: Optional[GeminiImageConfig] = None class GeminiImageGenerateContentRequest(BaseModel): - contents: List[GeminiContent] + contents: list[GeminiContent] generationConfig: Optional[GeminiImageGenerationConfig] = None - safetySettings: Optional[List[GeminiSafetySetting]] = None + safetySettings: Optional[list[GeminiSafetySetting]] = None systemInstruction: Optional[GeminiSystemInstructionContent] = None - tools: Optional[List[GeminiTool]] = None + tools: Optional[list[GeminiTool]] = None videoMetadata: Optional[GeminiVideoMetadata] = None diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 309e9a2d2..c1941cbe9 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -26,7 +26,7 @@ from comfy_api_nodes.apis import ( GeminiPart, GeminiMimeType, ) -from comfy_api_nodes.apis.gemini_api import GeminiImageGenerationConfig, GeminiImageGenerateContentRequest +from comfy_api_nodes.apis.gemini_api import GeminiImageGenerationConfig, GeminiImageGenerateContentRequest, GeminiImageConfig from comfy_api_nodes.apis.client import ( ApiEndpoint, HttpMethod, @@ -63,6 +63,7 @@ class GeminiImageModel(str, Enum): """ gemini_2_5_flash_image_preview = "gemini-2.5-flash-image-preview" + gemini_2_5_flash_image = "gemini-2.5-flash-image" def get_gemini_endpoint( @@ -538,7 +539,7 @@ class GeminiImage(ComfyNodeABC): { "tooltip": "The Gemini model to use for generating responses.", "options": [model.value for model in GeminiImageModel], - "default": GeminiImageModel.gemini_2_5_flash_image_preview.value, + "default": GeminiImageModel.gemini_2_5_flash_image.value, }, ), "seed": ( @@ -579,6 +580,14 @@ class GeminiImage(ComfyNodeABC): # "tooltip": "How many images to generate", # }, # ), + "aspect_ratio": ( + IO.COMBO, + { + "tooltip": "Defaults to matching the output image size to that of your input image, or otherwise generates 1:1 squares.", + "options": ["auto", "1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"], + "default": "auto", + }, + ), }, "hidden": { "auth_token": "AUTH_TOKEN_COMFY_ORG", @@ -600,15 +609,17 @@ class GeminiImage(ComfyNodeABC): images: Optional[IO.IMAGE] = None, files: Optional[list[GeminiPart]] = None, n=1, + aspect_ratio: str = "auto", unique_id: Optional[str] = None, **kwargs, ): - # Validate inputs validate_string(prompt, strip_whitespace=True, min_length=1) - # Create parts list with text prompt as the first part parts: list[GeminiPart] = [create_text_part(prompt)] - # Add other modal parts + if not aspect_ratio: + aspect_ratio = "auto" # for backward compatability with old workflows; to-do remove this in December + image_config = GeminiImageConfig(aspectRatio=aspect_ratio) + if images is not None: image_parts = create_image_parts(images) parts.extend(image_parts) @@ -625,7 +636,8 @@ class GeminiImage(ComfyNodeABC): ), ], generationConfig=GeminiImageGenerationConfig( - responseModalities=["TEXT","IMAGE"] + responseModalities=["TEXT","IMAGE"], + imageConfig=None if aspect_ratio == "auto" else image_config, ) ), auth_kwargs=kwargs, From 14d642acd66973c81a806dc6f0562d89b4ba3506 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 11 Oct 2025 02:21:40 +0300 Subject: [PATCH 434/759] feat(api-nodes): add price extractor feature; small fixes to Kling & Pika nodes (#10284) --- comfy_api_nodes/apis/client.py | 15 ++++++++++++--- comfy_api_nodes/nodes_kling.py | 33 +++++++++++++++++++-------------- comfy_api_nodes/nodes_pika.py | 2 ++ 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py index e08dfb093..d05e1c16a 100644 --- a/comfy_api_nodes/apis/client.py +++ b/comfy_api_nodes/apis/client.py @@ -782,9 +782,11 @@ class PollingOperation(Generic[T, R]): poll_endpoint: ApiEndpoint[EmptyRequest, R], completed_statuses: list[str], failed_statuses: list[str], + *, status_extractor: Callable[[R], Optional[str]], progress_extractor: Callable[[R], Optional[float]] | None = None, result_url_extractor: Callable[[R], Optional[str]] | None = None, + price_extractor: Callable[[R], Optional[float]] | None = None, request: Optional[T] = None, api_base: str | None = None, auth_token: Optional[str] = None, @@ -815,10 +817,12 @@ class PollingOperation(Generic[T, R]): self.status_extractor = status_extractor or (lambda x: getattr(x, "status", None)) self.progress_extractor = progress_extractor self.result_url_extractor = result_url_extractor + self.price_extractor = price_extractor self.node_id = node_id self.completed_statuses = completed_statuses self.failed_statuses = failed_statuses self.final_response: Optional[R] = None + self.extracted_price: Optional[float] = None async def execute(self, client: Optional[ApiClient] = None) -> R: owns_client = client is None @@ -840,6 +844,8 @@ class PollingOperation(Generic[T, R]): def _display_text_on_node(self, text: str): if not self.node_id: return + if self.extracted_price is not None: + text = f"Price: {self.extracted_price}$\n{text}" PromptServer.instance.send_progress_text(text, self.node_id) def _display_time_progress_on_node(self, time_completed: int | float): @@ -877,9 +883,7 @@ class PollingOperation(Generic[T, R]): try: logging.debug("[DEBUG] Polling attempt #%s", poll_count) - request_dict = ( - None if self.request is None else self.request.model_dump(exclude_none=True) - ) + request_dict = None if self.request is None else self.request.model_dump(exclude_none=True) if poll_count == 1: logging.debug( @@ -912,6 +916,11 @@ class PollingOperation(Generic[T, R]): if new_progress is not None: progress.update_absolute(new_progress, total=PROGRESS_BAR_MAX) + if self.price_extractor: + price = self.price_extractor(response_obj) + if price is not None: + self.extracted_price = price + if status == TaskStatus.COMPLETED: message = "Task completed successfully" if self.result_url_extractor: diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index a3cd09786..2117cfa91 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -73,6 +73,7 @@ from comfy_api_nodes.util.validation_utils import ( validate_video_dimensions, validate_video_duration, ) +from comfy_api.input_impl import VideoFromFile from comfy_api.input.basic_types import AudioInput from comfy_api.input.video_types import VideoInput from comfy_api.latest import ComfyExtension, io as comfy_io @@ -511,7 +512,7 @@ async def execute_video_effect( image_1: torch.Tensor, image_2: Optional[torch.Tensor] = None, model_mode: Optional[KlingVideoGenMode] = None, -) -> comfy_io.NodeOutput: +) -> tuple[VideoFromFile, str, str]: if dual_character: request_input_field = KlingDualCharacterEffectInput( model_name=model_name, @@ -562,7 +563,7 @@ async def execute_video_effect( validate_video_result_response(final_response) video = get_video_from_response(final_response) - return comfy_io.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) + return await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration) async def execute_lipsync( @@ -1271,7 +1272,7 @@ class KlingDualCharacterVideoEffectNode(comfy_io.ComfyNode): image_1=image_left, image_2=image_right, ) - return video, duration + return comfy_io.NodeOutput(video, duration) class KlingSingleImageVideoEffectNode(comfy_io.ComfyNode): @@ -1320,17 +1321,21 @@ class KlingSingleImageVideoEffectNode(comfy_io.ComfyNode): model_name: KlingSingleImageEffectModelName, duration: KlingVideoGenDuration, ) -> comfy_io.NodeOutput: - return await execute_video_effect( - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, - dual_character=False, - effect_scene=effect_scene, - model_name=model_name, - duration=duration, - image_1=image, + return comfy_io.NodeOutput( + *( + await execute_video_effect( + auth_kwargs={ + "auth_token": cls.hidden.auth_token_comfy_org, + "comfy_api_key": cls.hidden.api_key_comfy_org, + }, + node_id=cls.hidden.unique_id, + dual_character=False, + effect_scene=effect_scene, + model_name=model_name, + duration=duration, + image_1=image, + ) + ) ) diff --git a/comfy_api_nodes/nodes_pika.py b/comfy_api_nodes/nodes_pika.py index 10f11666d..822cfee64 100644 --- a/comfy_api_nodes/nodes_pika.py +++ b/comfy_api_nodes/nodes_pika.py @@ -17,6 +17,7 @@ from comfy_api.input_impl.video_types import VideoCodec, VideoContainer, VideoIn from comfy_api_nodes.apinode_utils import ( download_url_to_video_output, tensor_to_bytesio, + validate_string, ) from comfy_api_nodes.apis import pika_defs from comfy_api_nodes.apis.client import ( @@ -590,6 +591,7 @@ class PikaStartEndFrameNode(comfy_io.ComfyNode): resolution: str, duration: int, ) -> comfy_io.NodeOutput: + validate_string(prompt_text, field_name="prompt_text", min_length=1) pika_files = [ ("keyFrames", ("image_start.png", tensor_to_bytesio(image_start), "image/png")), ("keyFrames", ("image_end.png", tensor_to_bytesio(image_end), "image/png")), From f43b8ab2a2eda034651187222829f72aa82eae6c Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sun, 12 Oct 2025 01:27:22 +0800 Subject: [PATCH 435/759] Update template to 0.1.95 (#10294) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d4594df39..9e0a5e0de 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.27.10 -comfyui-workflow-templates==0.1.94 +comfyui-workflow-templates==0.1.95 comfyui-embedded-docs==0.2.6 torch torchsde From 84e9ce32c6d9d340404ee0798a426dae52bbee8b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 11 Oct 2025 19:57:23 -0700 Subject: [PATCH 436/759] Implement the mmaudio VAE. (#10300) --- comfy/ldm/mmaudio/vae/__init__.py | 0 comfy/ldm/mmaudio/vae/activations.py | 120 ++++++++ comfy/ldm/mmaudio/vae/alias_free_torch.py | 157 ++++++++++ comfy/ldm/mmaudio/vae/autoencoder.py | 156 ++++++++++ comfy/ldm/mmaudio/vae/bigvgan.py | 219 +++++++++++++ comfy/ldm/mmaudio/vae/distributions.py | 92 ++++++ comfy/ldm/mmaudio/vae/vae.py | 358 ++++++++++++++++++++++ comfy/ldm/mmaudio/vae/vae_modules.py | 121 ++++++++ comfy/sd.py | 24 ++ 9 files changed, 1247 insertions(+) create mode 100644 comfy/ldm/mmaudio/vae/__init__.py create mode 100644 comfy/ldm/mmaudio/vae/activations.py create mode 100644 comfy/ldm/mmaudio/vae/alias_free_torch.py create mode 100644 comfy/ldm/mmaudio/vae/autoencoder.py create mode 100644 comfy/ldm/mmaudio/vae/bigvgan.py create mode 100644 comfy/ldm/mmaudio/vae/distributions.py create mode 100644 comfy/ldm/mmaudio/vae/vae.py create mode 100644 comfy/ldm/mmaudio/vae/vae_modules.py diff --git a/comfy/ldm/mmaudio/vae/__init__.py b/comfy/ldm/mmaudio/vae/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/comfy/ldm/mmaudio/vae/activations.py b/comfy/ldm/mmaudio/vae/activations.py new file mode 100644 index 000000000..db9192e3e --- /dev/null +++ b/comfy/ldm/mmaudio/vae/activations.py @@ -0,0 +1,120 @@ +# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. +# LICENSE is in incl_licenses directory. + +import torch +from torch import nn, sin, pow +from torch.nn import Parameter +import comfy.model_management + +class Snake(nn.Module): + ''' + Implementation of a sine-based periodic activation function + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter + References: + - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snake(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha: trainable parameter + alpha is initialized to 1 by default, higher values = higher-frequency. + alpha will be trained along with the rest of your model. + ''' + super(Snake, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: + self.alpha = Parameter(torch.empty(in_features)) + else: + self.alpha = Parameter(torch.empty(in_features)) + + self.alpha.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. + Snake ∶= x + 1/a * sin^2 (xa) + ''' + alpha = comfy.model_management.cast_to(self.alpha, dtype=x.dtype, device=x.device).unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + if self.alpha_logscale: + alpha = torch.exp(alpha) + x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x + + +class SnakeBeta(nn.Module): + ''' + A modified Snake function which uses separate parameters for the magnitude of the periodic components + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + References: + - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snakebeta(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + alpha is initialized to 1 by default, higher values = higher-frequency. + beta is initialized to 1 by default, higher values = higher-magnitude. + alpha will be trained along with the rest of your model. + ''' + super(SnakeBeta, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: + self.alpha = Parameter(torch.empty(in_features)) + self.beta = Parameter(torch.empty(in_features)) + else: + self.alpha = Parameter(torch.empty(in_features)) + self.beta = Parameter(torch.empty(in_features)) + + self.alpha.requires_grad = alpha_trainable + self.beta.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. + SnakeBeta ∶= x + 1/b * sin^2 (xa) + ''' + alpha = comfy.model_management.cast_to(self.alpha, dtype=x.dtype, device=x.device).unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + beta = comfy.model_management.cast_to(self.beta, dtype=x.dtype, device=x.device).unsqueeze(0).unsqueeze(-1) + if self.alpha_logscale: + alpha = torch.exp(alpha) + beta = torch.exp(beta) + x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x diff --git a/comfy/ldm/mmaudio/vae/alias_free_torch.py b/comfy/ldm/mmaudio/vae/alias_free_torch.py new file mode 100644 index 000000000..35c70b897 --- /dev/null +++ b/comfy/ldm/mmaudio/vae/alias_free_torch.py @@ -0,0 +1,157 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +import comfy.model_management + +if 'sinc' in dir(torch): + sinc = torch.sinc +else: + # This code is adopted from adefossez's julius.core.sinc under the MIT License + # https://adefossez.github.io/julius/julius/core.html + # LICENSE is in incl_licenses directory. + def sinc(x: torch.Tensor): + """ + Implementation of sinc, i.e. sin(pi * x) / (pi * x) + __Warning__: Different to julius.sinc, the input is multiplied by `pi`! + """ + return torch.where(x == 0, + torch.tensor(1., device=x.device, dtype=x.dtype), + torch.sin(math.pi * x) / math.pi / x) + + +# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License +# https://adefossez.github.io/julius/julius/lowpass.html +# LICENSE is in incl_licenses directory. +def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] + even = (kernel_size % 2 == 0) + half_size = kernel_size // 2 + + #For kaiser window + delta_f = 4 * half_width + A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 + if A > 50.: + beta = 0.1102 * (A - 8.7) + elif A >= 21.: + beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) + else: + beta = 0. + window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) + + # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio + if even: + time = (torch.arange(-half_size, half_size) + 0.5) + else: + time = torch.arange(kernel_size) - half_size + if cutoff == 0: + filter_ = torch.zeros_like(time) + else: + filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) + # Normalize filter to have sum = 1, otherwise we will have a small leakage + # of the constant component in the input signal. + filter_ /= filter_.sum() + filter = filter_.view(1, 1, kernel_size) + + return filter + + +class LowPassFilter1d(nn.Module): + def __init__(self, + cutoff=0.5, + half_width=0.6, + stride: int = 1, + padding: bool = True, + padding_mode: str = 'replicate', + kernel_size: int = 12): + # kernel_size should be even number for stylegan3 setup, + # in this implementation, odd number is also possible. + super().__init__() + if cutoff < -0.: + raise ValueError("Minimum cutoff must be larger than zero.") + if cutoff > 0.5: + raise ValueError("A cutoff above 0.5 does not make sense.") + self.kernel_size = kernel_size + self.even = (kernel_size % 2 == 0) + self.pad_left = kernel_size // 2 - int(self.even) + self.pad_right = kernel_size // 2 + self.stride = stride + self.padding = padding + self.padding_mode = padding_mode + filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) + self.register_buffer("filter", filter) + + #input [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + if self.padding: + x = F.pad(x, (self.pad_left, self.pad_right), + mode=self.padding_mode) + out = F.conv1d(x, comfy.model_management.cast_to(self.filter.expand(C, -1, -1), dtype=x.dtype, device=x.device), + stride=self.stride, groups=C) + + return out + + +class UpSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.stride = ratio + self.pad = self.kernel_size // ratio - 1 + self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 + self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 + filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + kernel_size=self.kernel_size) + self.register_buffer("filter", filter) + + # x: [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + x = F.pad(x, (self.pad, self.pad), mode='replicate') + x = self.ratio * F.conv_transpose1d( + x, comfy.model_management.cast_to(self.filter.expand(C, -1, -1), dtype=x.dtype, device=x.device), stride=self.stride, groups=C) + x = x[..., self.pad_left:-self.pad_right] + + return x + + +class DownSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + stride=ratio, + kernel_size=self.kernel_size) + + def forward(self, x): + xx = self.lowpass(x) + + return xx + +class Activation1d(nn.Module): + def __init__(self, + activation, + up_ratio: int = 2, + down_ratio: int = 2, + up_kernel_size: int = 12, + down_kernel_size: int = 12): + super().__init__() + self.up_ratio = up_ratio + self.down_ratio = down_ratio + self.act = activation + self.upsample = UpSample1d(up_ratio, up_kernel_size) + self.downsample = DownSample1d(down_ratio, down_kernel_size) + + # x: [B,C,T] + def forward(self, x): + x = self.upsample(x) + x = self.act(x) + x = self.downsample(x) + + return x diff --git a/comfy/ldm/mmaudio/vae/autoencoder.py b/comfy/ldm/mmaudio/vae/autoencoder.py new file mode 100644 index 000000000..cbb9de302 --- /dev/null +++ b/comfy/ldm/mmaudio/vae/autoencoder.py @@ -0,0 +1,156 @@ +from typing import Literal + +import torch +import torch.nn as nn + +from .distributions import DiagonalGaussianDistribution +from .vae import VAE_16k +from .bigvgan import BigVGANVocoder +import logging + +try: + import torchaudio +except: + logging.warning("torchaudio missing, MMAudio VAE model will be broken") + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5, *, norm_fn): + return norm_fn(torch.clamp(x, min=clip_val) * C) + + +def spectral_normalize_torch(magnitudes, norm_fn): + output = dynamic_range_compression_torch(magnitudes, norm_fn=norm_fn) + return output + +class MelConverter(nn.Module): + + def __init__( + self, + *, + sampling_rate: float, + n_fft: int, + num_mels: int, + hop_size: int, + win_size: int, + fmin: float, + fmax: float, + norm_fn, + ): + super().__init__() + self.sampling_rate = sampling_rate + self.n_fft = n_fft + self.num_mels = num_mels + self.hop_size = hop_size + self.win_size = win_size + self.fmin = fmin + self.fmax = fmax + self.norm_fn = norm_fn + + # mel = librosa_mel_fn(sr=self.sampling_rate, + # n_fft=self.n_fft, + # n_mels=self.num_mels, + # fmin=self.fmin, + # fmax=self.fmax) + # mel_basis = torch.from_numpy(mel).float() + mel_basis = torch.empty((num_mels, 1 + n_fft // 2)) + hann_window = torch.hann_window(self.win_size) + + self.register_buffer('mel_basis', mel_basis) + self.register_buffer('hann_window', hann_window) + + @property + def device(self): + return self.mel_basis.device + + def forward(self, waveform: torch.Tensor, center: bool = False) -> torch.Tensor: + waveform = waveform.clamp(min=-1., max=1.).to(self.device) + + waveform = torch.nn.functional.pad( + waveform.unsqueeze(1), + [int((self.n_fft - self.hop_size) / 2), + int((self.n_fft - self.hop_size) / 2)], + mode='reflect') + waveform = waveform.squeeze(1) + + spec = torch.stft(waveform, + self.n_fft, + hop_length=self.hop_size, + win_length=self.win_size, + window=self.hann_window, + center=center, + pad_mode='reflect', + normalized=False, + onesided=True, + return_complex=True) + + spec = torch.view_as_real(spec) + spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) + spec = torch.matmul(self.mel_basis, spec) + spec = spectral_normalize_torch(spec, self.norm_fn) + + return spec + +class AudioAutoencoder(nn.Module): + + def __init__( + self, + *, + # ckpt_path: str, + mode=Literal['16k', '44k'], + need_vae_encoder: bool = True, + ): + super().__init__() + + assert mode == "16k", "Only 16k mode is supported currently." + self.mel_converter = MelConverter(sampling_rate=16_000, + n_fft=1024, + num_mels=80, + hop_size=256, + win_size=1024, + fmin=0, + fmax=8_000, + norm_fn=torch.log10) + + self.vae = VAE_16k().eval() + + bigvgan_config = { + "resblock": "1", + "num_mels": 80, + "upsample_rates": [4, 4, 2, 2, 2, 2], + "upsample_kernel_sizes": [8, 8, 4, 4, 4, 4], + "upsample_initial_channel": 1536, + "resblock_kernel_sizes": [3, 7, 11], + "resblock_dilation_sizes": [ + [1, 3, 5], + [1, 3, 5], + [1, 3, 5], + ], + "activation": "snakebeta", + "snake_logscale": True, + } + + self.vocoder = BigVGANVocoder( + bigvgan_config + ).eval() + + @torch.inference_mode() + def encode_audio(self, x) -> DiagonalGaussianDistribution: + # x: (B * L) + mel = self.mel_converter(x) + dist = self.vae.encode(mel) + + return dist + + @torch.no_grad() + def decode(self, z): + mel_decoded = self.vae.decode(z) + audio = self.vocoder(mel_decoded) + + audio = torchaudio.functional.resample(audio, 16000, 44100) + return audio + + @torch.no_grad() + def encode(self, audio): + audio = audio.mean(dim=1) + audio = torchaudio.functional.resample(audio, 44100, 16000) + dist = self.encode_audio(audio) + return dist.mean diff --git a/comfy/ldm/mmaudio/vae/bigvgan.py b/comfy/ldm/mmaudio/vae/bigvgan.py new file mode 100644 index 000000000..3a24337f6 --- /dev/null +++ b/comfy/ldm/mmaudio/vae/bigvgan.py @@ -0,0 +1,219 @@ +# Copyright (c) 2022 NVIDIA CORPORATION. +# Licensed under the MIT license. + +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + +import torch +import torch.nn as nn +from types import SimpleNamespace +from . import activations +from .alias_free_torch import Activation1d +import comfy.ops +ops = comfy.ops.disable_weight_init + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + +class AMPBlock1(torch.nn.Module): + + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None): + super(AMPBlock1, self).__init__() + self.h = h + + self.convs1 = nn.ModuleList([ + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0])), + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1])), + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2])) + ]) + + self.convs2 = nn.ModuleList([ + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1)), + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1)), + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1)) + ]) + + self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError( + "activation incorrectly specified. check the config file and look for 'activation'." + ) + + def forward(self, x): + acts1, acts2 = self.activations[::2], self.activations[1::2] + for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): + xt = a1(x) + xt = c1(xt) + xt = a2(xt) + xt = c2(xt) + x = xt + x + + return x + + +class AMPBlock2(torch.nn.Module): + + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None): + super(AMPBlock2, self).__init__() + self.h = h + + self.convs = nn.ModuleList([ + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0])), + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1])) + ]) + + self.num_layers = len(self.convs) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError( + "activation incorrectly specified. check the config file and look for 'activation'." + ) + + def forward(self, x): + for c, a in zip(self.convs, self.activations): + xt = a(x) + xt = c(xt) + x = xt + x + + return x + + +class BigVGANVocoder(torch.nn.Module): + # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks. + def __init__(self, h): + super().__init__() + if isinstance(h, dict): + h = SimpleNamespace(**h) + self.h = h + + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + + # pre conv + self.conv_pre = ops.Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3) + + # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default + resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2 + + # transposed conv-based upsamplers. does not apply anti-aliasing + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append( + nn.ModuleList([ + ops.ConvTranspose1d(h.upsample_initial_channel // (2**i), + h.upsample_initial_channel // (2**(i + 1)), + k, + u, + padding=(k - u) // 2) + ])) + + # residual blocks using anti-aliased multi-periodicity composition modules (AMP) + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2**(i + 1)) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d, activation=h.activation)) + + # post conv + if h.activation == "snake": # periodic nonlinearity with snake function and anti-aliasing + activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + elif h.activation == "snakebeta": # periodic nonlinearity with snakebeta function and anti-aliasing + activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + else: + raise NotImplementedError( + "activation incorrectly specified. check the config file and look for 'activation'." + ) + + self.conv_post = ops.Conv1d(ch, 1, 7, 1, padding=3) + + + def forward(self, x): + # pre conv + x = self.conv_pre(x) + + for i in range(self.num_upsamples): + # upsampling + for i_up in range(len(self.ups[i])): + x = self.ups[i][i_up](x) + # AMP blocks + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + # post conv + x = self.activation_post(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x diff --git a/comfy/ldm/mmaudio/vae/distributions.py b/comfy/ldm/mmaudio/vae/distributions.py new file mode 100644 index 000000000..df987c5ec --- /dev/null +++ b/comfy/ldm/mmaudio/vae/distributions.py @@ -0,0 +1,92 @@ +import torch +import numpy as np + + +class AbstractDistribution: + def sample(self): + raise NotImplementedError() + + def mode(self): + raise NotImplementedError() + + +class DiracDistribution(AbstractDistribution): + def __init__(self, value): + self.value = value + + def sample(self): + return self.value + + def mode(self): + return self.value + + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters, deterministic=False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like(self.mean, device=self.parameters.device) + + def sample(self): + x = self.mean + self.std * torch.randn(self.mean.shape, device=self.parameters.device) + return x + + def kl(self, other=None): + if self.deterministic: + return torch.Tensor([0.]) + else: + if other is None: + return 0.5 * torch.sum(torch.pow(self.mean, 2) + + self.var - 1.0 - self.logvar, + dim=[1, 2, 3]) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var - 1.0 - self.logvar + other.logvar, + dim=[1, 2, 3]) + + def nll(self, sample, dims=[1,2,3]): + if self.deterministic: + return torch.Tensor([0.]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, + dim=dims) + + def mode(self): + return self.mean + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 + Compute the KL divergence between two gaussians. + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, torch.Tensor): + tensor = obj + break + assert tensor is not None, "at least one argument must be a Tensor" + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for torch.exp(). + logvar1, logvar2 = [ + x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + return 0.5 * ( + -1.0 + + logvar2 + - logvar1 + + torch.exp(logvar1 - logvar2) + + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) + ) diff --git a/comfy/ldm/mmaudio/vae/vae.py b/comfy/ldm/mmaudio/vae/vae.py new file mode 100644 index 000000000..62f24606c --- /dev/null +++ b/comfy/ldm/mmaudio/vae/vae.py @@ -0,0 +1,358 @@ +import logging +from typing import Optional + +import torch +import torch.nn as nn + +from .vae_modules import (AttnBlock1D, Downsample1D, ResnetBlock1D, + Upsample1D, nonlinearity) +from .distributions import DiagonalGaussianDistribution + +import comfy.ops +ops = comfy.ops.disable_weight_init + +log = logging.getLogger() + +DATA_MEAN_80D = [ + -1.6058, -1.3676, -1.2520, -1.2453, -1.2078, -1.2224, -1.2419, -1.2439, -1.2922, -1.2927, + -1.3170, -1.3543, -1.3401, -1.3836, -1.3907, -1.3912, -1.4313, -1.4152, -1.4527, -1.4728, + -1.4568, -1.5101, -1.5051, -1.5172, -1.5623, -1.5373, -1.5746, -1.5687, -1.6032, -1.6131, + -1.6081, -1.6331, -1.6489, -1.6489, -1.6700, -1.6738, -1.6953, -1.6969, -1.7048, -1.7280, + -1.7361, -1.7495, -1.7658, -1.7814, -1.7889, -1.8064, -1.8221, -1.8377, -1.8417, -1.8643, + -1.8857, -1.8929, -1.9173, -1.9379, -1.9531, -1.9673, -1.9824, -2.0042, -2.0215, -2.0436, + -2.0766, -2.1064, -2.1418, -2.1855, -2.2319, -2.2767, -2.3161, -2.3572, -2.3954, -2.4282, + -2.4659, -2.5072, -2.5552, -2.6074, -2.6584, -2.7107, -2.7634, -2.8266, -2.8981, -2.9673 +] + +DATA_STD_80D = [ + 1.0291, 1.0411, 1.0043, 0.9820, 0.9677, 0.9543, 0.9450, 0.9392, 0.9343, 0.9297, 0.9276, 0.9263, + 0.9242, 0.9254, 0.9232, 0.9281, 0.9263, 0.9315, 0.9274, 0.9247, 0.9277, 0.9199, 0.9188, 0.9194, + 0.9160, 0.9161, 0.9146, 0.9161, 0.9100, 0.9095, 0.9145, 0.9076, 0.9066, 0.9095, 0.9032, 0.9043, + 0.9038, 0.9011, 0.9019, 0.9010, 0.8984, 0.8983, 0.8986, 0.8961, 0.8962, 0.8978, 0.8962, 0.8973, + 0.8993, 0.8976, 0.8995, 0.9016, 0.8982, 0.8972, 0.8974, 0.8949, 0.8940, 0.8947, 0.8936, 0.8939, + 0.8951, 0.8956, 0.9017, 0.9167, 0.9436, 0.9690, 1.0003, 1.0225, 1.0381, 1.0491, 1.0545, 1.0604, + 1.0761, 1.0929, 1.1089, 1.1196, 1.1176, 1.1156, 1.1117, 1.1070 +] + +DATA_MEAN_128D = [ + -3.3462, -2.6723, -2.4893, -2.3143, -2.2664, -2.3317, -2.1802, -2.4006, -2.2357, -2.4597, + -2.3717, -2.4690, -2.5142, -2.4919, -2.6610, -2.5047, -2.7483, -2.5926, -2.7462, -2.7033, + -2.7386, -2.8112, -2.7502, -2.9594, -2.7473, -3.0035, -2.8891, -2.9922, -2.9856, -3.0157, + -3.1191, -2.9893, -3.1718, -3.0745, -3.1879, -3.2310, -3.1424, -3.2296, -3.2791, -3.2782, + -3.2756, -3.3134, -3.3509, -3.3750, -3.3951, -3.3698, -3.4505, -3.4509, -3.5089, -3.4647, + -3.5536, -3.5788, -3.5867, -3.6036, -3.6400, -3.6747, -3.7072, -3.7279, -3.7283, -3.7795, + -3.8259, -3.8447, -3.8663, -3.9182, -3.9605, -3.9861, -4.0105, -4.0373, -4.0762, -4.1121, + -4.1488, -4.1874, -4.2461, -4.3170, -4.3639, -4.4452, -4.5282, -4.6297, -4.7019, -4.7960, + -4.8700, -4.9507, -5.0303, -5.0866, -5.1634, -5.2342, -5.3242, -5.4053, -5.4927, -5.5712, + -5.6464, -5.7052, -5.7619, -5.8410, -5.9188, -6.0103, -6.0955, -6.1673, -6.2362, -6.3120, + -6.3926, -6.4797, -6.5565, -6.6511, -6.8130, -6.9961, -7.1275, -7.2457, -7.3576, -7.4663, + -7.6136, -7.7469, -7.8815, -8.0132, -8.1515, -8.3071, -8.4722, -8.7418, -9.3975, -9.6628, + -9.7671, -9.8863, -9.9992, -10.0860, -10.1709, -10.5418, -11.2795, -11.3861 +] + +DATA_STD_128D = [ + 2.3804, 2.4368, 2.3772, 2.3145, 2.2803, 2.2510, 2.2316, 2.2083, 2.1996, 2.1835, 2.1769, 2.1659, + 2.1631, 2.1618, 2.1540, 2.1606, 2.1571, 2.1567, 2.1612, 2.1579, 2.1679, 2.1683, 2.1634, 2.1557, + 2.1668, 2.1518, 2.1415, 2.1449, 2.1406, 2.1350, 2.1313, 2.1415, 2.1281, 2.1352, 2.1219, 2.1182, + 2.1327, 2.1195, 2.1137, 2.1080, 2.1179, 2.1036, 2.1087, 2.1036, 2.1015, 2.1068, 2.0975, 2.0991, + 2.0902, 2.1015, 2.0857, 2.0920, 2.0893, 2.0897, 2.0910, 2.0881, 2.0925, 2.0873, 2.0960, 2.0900, + 2.0957, 2.0958, 2.0978, 2.0936, 2.0886, 2.0905, 2.0845, 2.0855, 2.0796, 2.0840, 2.0813, 2.0817, + 2.0838, 2.0840, 2.0917, 2.1061, 2.1431, 2.1976, 2.2482, 2.3055, 2.3700, 2.4088, 2.4372, 2.4609, + 2.4731, 2.4847, 2.5072, 2.5451, 2.5772, 2.6147, 2.6529, 2.6596, 2.6645, 2.6726, 2.6803, 2.6812, + 2.6899, 2.6916, 2.6931, 2.6998, 2.7062, 2.7262, 2.7222, 2.7158, 2.7041, 2.7485, 2.7491, 2.7451, + 2.7485, 2.7233, 2.7297, 2.7233, 2.7145, 2.6958, 2.6788, 2.6439, 2.6007, 2.4786, 2.2469, 2.1877, + 2.1392, 2.0717, 2.0107, 1.9676, 1.9140, 1.7102, 0.9101, 0.7164 +] + + +class VAE(nn.Module): + + def __init__( + self, + *, + data_dim: int, + embed_dim: int, + hidden_dim: int, + ): + super().__init__() + + if data_dim == 80: + self.data_mean = nn.Buffer(torch.tensor(DATA_MEAN_80D, dtype=torch.float32)) + self.data_std = nn.Buffer(torch.tensor(DATA_STD_80D, dtype=torch.float32)) + elif data_dim == 128: + self.data_mean = nn.Buffer(torch.tensor(DATA_MEAN_128D, dtype=torch.float32)) + self.data_std = nn.Buffer(torch.tensor(DATA_STD_128D, dtype=torch.float32)) + + self.data_mean = self.data_mean.view(1, -1, 1) + self.data_std = self.data_std.view(1, -1, 1) + + self.encoder = Encoder1D( + dim=hidden_dim, + ch_mult=(1, 2, 4), + num_res_blocks=2, + attn_layers=[3], + down_layers=[0], + in_dim=data_dim, + embed_dim=embed_dim, + ) + self.decoder = Decoder1D( + dim=hidden_dim, + ch_mult=(1, 2, 4), + num_res_blocks=2, + attn_layers=[3], + down_layers=[0], + in_dim=data_dim, + out_dim=data_dim, + embed_dim=embed_dim, + ) + + self.embed_dim = embed_dim + # self.quant_conv = nn.Conv1d(2 * embed_dim, 2 * embed_dim, 1) + # self.post_quant_conv = nn.Conv1d(embed_dim, embed_dim, 1) + + self.initialize_weights() + + def initialize_weights(self): + pass + + def encode(self, x: torch.Tensor, normalize: bool = True) -> DiagonalGaussianDistribution: + if normalize: + x = self.normalize(x) + moments = self.encoder(x) + posterior = DiagonalGaussianDistribution(moments) + return posterior + + def decode(self, z: torch.Tensor, unnormalize: bool = True) -> torch.Tensor: + dec = self.decoder(z) + if unnormalize: + dec = self.unnormalize(dec) + return dec + + def normalize(self, x: torch.Tensor) -> torch.Tensor: + return (x - comfy.model_management.cast_to(self.data_mean, dtype=x.dtype, device=x.device)) / comfy.model_management.cast_to(self.data_std, dtype=x.dtype, device=x.device) + + def unnormalize(self, x: torch.Tensor) -> torch.Tensor: + return x * comfy.model_management.cast_to(self.data_std, dtype=x.dtype, device=x.device) + comfy.model_management.cast_to(self.data_mean, dtype=x.dtype, device=x.device) + + def forward( + self, + x: torch.Tensor, + sample_posterior: bool = True, + rng: Optional[torch.Generator] = None, + normalize: bool = True, + unnormalize: bool = True, + ) -> tuple[torch.Tensor, DiagonalGaussianDistribution]: + + posterior = self.encode(x, normalize=normalize) + if sample_posterior: + z = posterior.sample(rng) + else: + z = posterior.mode() + dec = self.decode(z, unnormalize=unnormalize) + return dec, posterior + + def load_weights(self, src_dict) -> None: + self.load_state_dict(src_dict, strict=True) + + @property + def device(self) -> torch.device: + return next(self.parameters()).device + + def get_last_layer(self): + return self.decoder.conv_out.weight + + def remove_weight_norm(self): + return self + + +class Encoder1D(nn.Module): + + def __init__(self, + *, + dim: int, + ch_mult: tuple[int] = (1, 2, 4, 8), + num_res_blocks: int, + attn_layers: list[int] = [], + down_layers: list[int] = [], + resamp_with_conv: bool = True, + in_dim: int, + embed_dim: int, + double_z: bool = True, + kernel_size: int = 3, + clip_act: float = 256.0): + super().__init__() + self.dim = dim + self.num_layers = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_dim + self.clip_act = clip_act + self.down_layers = down_layers + self.attn_layers = attn_layers + self.conv_in = ops.Conv1d(in_dim, self.dim, kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + + in_ch_mult = (1, ) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + # downsampling + self.down = nn.ModuleList() + for i_level in range(self.num_layers): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = dim * in_ch_mult[i_level] + block_out = dim * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock1D(in_dim=block_in, + out_dim=block_out, + kernel_size=kernel_size, + use_norm=True)) + block_in = block_out + if i_level in attn_layers: + attn.append(AttnBlock1D(block_in)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level in down_layers: + down.downsample = Downsample1D(block_in, resamp_with_conv) + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock1D(in_dim=block_in, + out_dim=block_in, + kernel_size=kernel_size, + use_norm=True) + self.mid.attn_1 = AttnBlock1D(block_in) + self.mid.block_2 = ResnetBlock1D(in_dim=block_in, + out_dim=block_in, + kernel_size=kernel_size, + use_norm=True) + + # end + self.conv_out = ops.Conv1d(block_in, + 2 * embed_dim if double_z else embed_dim, + kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + + self.learnable_gain = nn.Parameter(torch.zeros([])) + + def forward(self, x): + + # downsampling + h = self.conv_in(x) + for i_level in range(self.num_layers): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](h) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + h = h.clamp(-self.clip_act, self.clip_act) + if i_level in self.down_layers: + h = self.down[i_level].downsample(h) + + # middle + h = self.mid.block_1(h) + h = self.mid.attn_1(h) + h = self.mid.block_2(h) + h = h.clamp(-self.clip_act, self.clip_act) + + # end + h = nonlinearity(h) + h = self.conv_out(h) * (self.learnable_gain + 1) + return h + + +class Decoder1D(nn.Module): + + def __init__(self, + *, + dim: int, + out_dim: int, + ch_mult: tuple[int] = (1, 2, 4, 8), + num_res_blocks: int, + attn_layers: list[int] = [], + down_layers: list[int] = [], + kernel_size: int = 3, + resamp_with_conv: bool = True, + in_dim: int, + embed_dim: int, + clip_act: float = 256.0): + super().__init__() + self.ch = dim + self.num_layers = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_dim + self.clip_act = clip_act + self.down_layers = [i + 1 for i in down_layers] # each downlayer add one + + # compute in_ch_mult, block_in and curr_res at lowest res + block_in = dim * ch_mult[self.num_layers - 1] + + # z to block_in + self.conv_in = ops.Conv1d(embed_dim, block_in, kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock1D(in_dim=block_in, out_dim=block_in, use_norm=True) + self.mid.attn_1 = AttnBlock1D(block_in) + self.mid.block_2 = ResnetBlock1D(in_dim=block_in, out_dim=block_in, use_norm=True) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_layers)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = dim * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append(ResnetBlock1D(in_dim=block_in, out_dim=block_out, use_norm=True)) + block_in = block_out + if i_level in attn_layers: + attn.append(AttnBlock1D(block_in)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level in self.down_layers: + up.upsample = Upsample1D(block_in, resamp_with_conv) + self.up.insert(0, up) # prepend to get consistent order + + # end + self.conv_out = ops.Conv1d(block_in, out_dim, kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + self.learnable_gain = nn.Parameter(torch.zeros([])) + + def forward(self, z): + # z to block_in + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h) + h = self.mid.attn_1(h) + h = self.mid.block_2(h) + h = h.clamp(-self.clip_act, self.clip_act) + + # upsampling + for i_level in reversed(range(self.num_layers)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + h = h.clamp(-self.clip_act, self.clip_act) + if i_level in self.down_layers: + h = self.up[i_level].upsample(h) + + h = nonlinearity(h) + h = self.conv_out(h) * (self.learnable_gain + 1) + return h + + +def VAE_16k(**kwargs) -> VAE: + return VAE(data_dim=80, embed_dim=20, hidden_dim=384, **kwargs) + + +def VAE_44k(**kwargs) -> VAE: + return VAE(data_dim=128, embed_dim=40, hidden_dim=512, **kwargs) + + +def get_my_vae(name: str, **kwargs) -> VAE: + if name == '16k': + return VAE_16k(**kwargs) + if name == '44k': + return VAE_44k(**kwargs) + raise ValueError(f'Unknown model: {name}') + diff --git a/comfy/ldm/mmaudio/vae/vae_modules.py b/comfy/ldm/mmaudio/vae/vae_modules.py new file mode 100644 index 000000000..3ad05134b --- /dev/null +++ b/comfy/ldm/mmaudio/vae/vae_modules.py @@ -0,0 +1,121 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from comfy.ldm.modules.diffusionmodules.model import vae_attention +import math +import comfy.ops +ops = comfy.ops.disable_weight_init + +def nonlinearity(x): + # swish + return torch.nn.functional.silu(x) / 0.596 + +def mp_sum(a, b, t=0.5): + return a.lerp(b, t) / math.sqrt((1 - t)**2 + t**2) + +def normalize(x, dim=None, eps=1e-4): + if dim is None: + dim = list(range(1, x.ndim)) + norm = torch.linalg.vector_norm(x, dim=dim, keepdim=True, dtype=torch.float32) + norm = torch.add(eps, norm, alpha=math.sqrt(norm.numel() / x.numel())) + return x / norm.to(x.dtype) + +class ResnetBlock1D(nn.Module): + + def __init__(self, *, in_dim, out_dim=None, conv_shortcut=False, kernel_size=3, use_norm=True): + super().__init__() + self.in_dim = in_dim + out_dim = in_dim if out_dim is None else out_dim + self.out_dim = out_dim + self.use_conv_shortcut = conv_shortcut + self.use_norm = use_norm + + self.conv1 = ops.Conv1d(in_dim, out_dim, kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + self.conv2 = ops.Conv1d(out_dim, out_dim, kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + if self.in_dim != self.out_dim: + if self.use_conv_shortcut: + self.conv_shortcut = ops.Conv1d(in_dim, out_dim, kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + else: + self.nin_shortcut = ops.Conv1d(in_dim, out_dim, kernel_size=1, padding=0, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + + # pixel norm + if self.use_norm: + x = normalize(x, dim=1) + + h = x + h = nonlinearity(h) + h = self.conv1(h) + + h = nonlinearity(h) + h = self.conv2(h) + + if self.in_dim != self.out_dim: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return mp_sum(x, h, t=0.3) + + +class AttnBlock1D(nn.Module): + + def __init__(self, in_channels, num_heads=1): + super().__init__() + self.in_channels = in_channels + + self.num_heads = num_heads + self.qkv = ops.Conv1d(in_channels, in_channels * 3, kernel_size=1, padding=0, bias=False) + self.proj_out = ops.Conv1d(in_channels, in_channels, kernel_size=1, padding=0, bias=False) + self.optimized_attention = vae_attention() + + def forward(self, x): + h = x + y = self.qkv(h) + y = y.reshape(y.shape[0], -1, 3, y.shape[-1]) + q, k, v = normalize(y, dim=1).unbind(2) + + h = self.optimized_attention(q, k, v) + h = self.proj_out(h) + + return mp_sum(x, h, t=0.3) + + +class Upsample1D(nn.Module): + + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = ops.Conv1d(in_channels, in_channels, kernel_size=3, padding=1, bias=False) + + def forward(self, x): + x = F.interpolate(x, scale_factor=2.0, mode='nearest-exact') # support 3D tensor(B,C,T) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample1D(nn.Module): + + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv1 = ops.Conv1d(in_channels, in_channels, kernel_size=1, padding=0, bias=False) + self.conv2 = ops.Conv1d(in_channels, in_channels, kernel_size=1, padding=0, bias=False) + + def forward(self, x): + + if self.with_conv: + x = self.conv1(x) + + x = F.avg_pool1d(x, kernel_size=2, stride=2) + + if self.with_conv: + x = self.conv2(x) + + return x diff --git a/comfy/sd.py b/comfy/sd.py index f2d95f85a..b9c2e995e 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -18,6 +18,7 @@ import comfy.ldm.wan.vae2_2 import comfy.ldm.hunyuan3d.vae import comfy.ldm.ace.vae.music_dcae_pipeline import comfy.ldm.hunyuan_video.vae +import comfy.ldm.mmaudio.vae.autoencoder import comfy.pixel_space_convert import yaml import math @@ -291,6 +292,7 @@ class VAE: self.downscale_index_formula = None self.upscale_index_formula = None self.extra_1d_channel = None + self.crop_input = True if config is None: if "decoder.mid.block_1.mix_factor" in sd: @@ -542,6 +544,25 @@ class VAE: self.latent_channels = 3 self.latent_dim = 2 self.output_channels = 3 + elif "vocoder.activation_post.downsample.lowpass.filter" in sd: #MMAudio VAE + sample_rate = 16000 + if sample_rate == 16000: + mode = '16k' + else: + mode = '44k' + + self.first_stage_model = comfy.ldm.mmaudio.vae.autoencoder.AudioAutoencoder(mode=mode) + self.memory_used_encode = lambda shape, dtype: (30 * shape[2]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (90 * shape[2] * 1411.2) * model_management.dtype_size(dtype) + self.latent_channels = 20 + self.output_channels = 2 + self.upscale_ratio = 512 * (44100 / sample_rate) + self.downscale_ratio = 512 * (44100 / sample_rate) + self.latent_dim = 1 + self.process_output = lambda audio: audio + self.process_input = lambda audio: audio + self.working_dtypes = [torch.float32] + self.crop_input = False else: logging.warning("WARNING: No VAE weights detected, VAE not initalized.") self.first_stage_model = None @@ -575,6 +596,9 @@ class VAE: raise RuntimeError("ERROR: VAE is invalid: None\n\nIf the VAE is from a checkpoint loader node your checkpoint does not contain a valid VAE.") def vae_encode_crop_pixels(self, pixels): + if not self.crop_input: + return pixels + downscale_ratio = self.spacial_compression_encode() dims = pixels.shape[1:-1] From a125cd84b054a57729b5eecab930ca9408719832 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 11 Oct 2025 21:28:01 -0700 Subject: [PATCH 437/759] Improve AMD performance. (#10302) I honestly have no idea why this improves things but it does. --- comfy/model_management.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index c5b817b62..146c00925 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -332,6 +332,7 @@ except: SUPPORT_FP8_OPS = args.supports_fp8_compute try: if is_amd(): + torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD try: rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2])) except: @@ -925,11 +926,7 @@ def vae_dtype(device=None, allowed_dtypes=[]): if d == torch.float16 and should_use_fp16(device): return d - # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32 - # slowness still a problem on pytorch nightly 2.9.0.dev20250720+rocm6.4 tested on RDNA3 - # also a problem on RDNA4 except fp32 is also slow there. - # This is due to large bf16 convolutions being extremely slow. - if d == torch.bfloat16 and ((not is_amd()) or amd_min_version(device, min_rdna_version=4)) and should_use_bf16(device): + if d == torch.bfloat16 and should_use_bf16(device): return d return torch.float32 From fdc92863b6dc6d0edff85e6dbb6a2382046c020d Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Mon, 13 Oct 2025 11:32:02 +0800 Subject: [PATCH 438/759] Update node docs to 0.3.0 (#10318) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9e0a5e0de..bbb22364f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ comfyui-frontend-package==1.27.10 comfyui-workflow-templates==0.1.95 -comfyui-embedded-docs==0.2.6 +comfyui-embedded-docs==0.3.0 torch torchsde torchvision From 894837de9ae9efd87ea81a29af66a9c29628ef47 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Sun, 12 Oct 2025 20:35:33 -0700 Subject: [PATCH 439/759] update extra models paths example (#10316) --- extra_model_paths.yaml.example | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/extra_model_paths.yaml.example b/extra_model_paths.yaml.example index b55913a5a..8d576e51d 100644 --- a/extra_model_paths.yaml.example +++ b/extra_model_paths.yaml.example @@ -28,7 +28,9 @@ a111: # # You can use is_default to mark that these folders should be listed first, and used as the default dirs for eg downloads # #is_default: true # checkpoints: models/checkpoints/ -# clip: models/clip/ +# text_encoders: | +# models/text_encoders/ +# models/clip/ # legacy location still supported # clip_vision: models/clip_vision/ # configs: models/configs/ # controlnet: models/controlnet/ @@ -40,6 +42,9 @@ a111: # upscale_models: models/upscale_models/ # vae: models/vae/ +# For a full list of supported keys (style_models, vae_approx, hypernetworks, photomaker, +# model_patches, audio_encoders, classifiers, etc.) see folder_paths.py. + #other_ui: # base_path: path/to/ui # checkpoints: models/checkpoints From d68ece7301c63da11e0b565da0ecc2900c8ea447 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 12 Oct 2025 20:54:41 -0700 Subject: [PATCH 440/759] Update the extra_model_paths.yaml.example (#10319) --- extra_model_paths.yaml.example | 43 ++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/extra_model_paths.yaml.example b/extra_model_paths.yaml.example index 8d576e51d..34df01681 100644 --- a/extra_model_paths.yaml.example +++ b/extra_model_paths.yaml.example @@ -1,25 +1,5 @@ #Rename this to extra_model_paths.yaml and ComfyUI will load it - -#config for a1111 ui -#all you have to do is change the base_path to where yours is installed -a111: - base_path: path/to/stable-diffusion-webui/ - - checkpoints: models/Stable-diffusion - configs: models/Stable-diffusion - vae: models/VAE - loras: | - models/Lora - models/LyCORIS - upscale_models: | - models/ESRGAN - models/RealESRGAN - models/SwinIR - embeddings: embeddings - hypernetworks: models/hypernetworks - controlnet: models/ControlNet - #config for comfyui #your base path should be either an existing comfy install or a central folder where you store all of your models, loras, etc. @@ -41,6 +21,29 @@ a111: # loras: models/loras/ # upscale_models: models/upscale_models/ # vae: models/vae/ +# audio_encoders: models/audio_encoders/ +# model_patches: models/model_patches/ + + +#config for a1111 ui +#all you have to do is uncomment this (remove the #) and change the base_path to where yours is installed + +#a111: +# base_path: path/to/stable-diffusion-webui/ +# checkpoints: models/Stable-diffusion +# configs: models/Stable-diffusion +# vae: models/VAE +# loras: | +# models/Lora +# models/LyCORIS +# upscale_models: | +# models/ESRGAN +# models/RealESRGAN +# models/SwinIR +# embeddings: embeddings +# hypernetworks: models/hypernetworks +# controlnet: models/ControlNet + # For a full list of supported keys (style_models, vae_approx, hypernetworks, photomaker, # model_patches, audio_encoders, classifiers, etc.) see folder_paths.py. From e693e4db6a2df8482599eed348be15f87799b910 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 13 Oct 2025 11:57:27 -0700 Subject: [PATCH 441/759] Always set diffusion model to eval() mode. (#10331) --- comfy/model_base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index b0b9cde7d..8274c7dea 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -138,6 +138,7 @@ class BaseModel(torch.nn.Module): else: operations = model_config.custom_operations self.diffusion_model = unet_model(**unet_config, device=device, operations=operations) + self.diffusion_model.eval() if comfy.model_management.force_channels_last(): self.diffusion_model.to(memory_format=torch.channels_last) logging.debug("using channels last mode for diffusion model") @@ -669,7 +670,6 @@ class Lotus(BaseModel): class StableCascade_C(BaseModel): def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None): super().__init__(model_config, model_type, device=device, unet_model=StageC) - self.diffusion_model.eval().requires_grad_(False) def extra_conds(self, **kwargs): out = {} @@ -698,7 +698,6 @@ class StableCascade_C(BaseModel): class StableCascade_B(BaseModel): def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None): super().__init__(model_config, model_type, device=device, unet_model=StageB) - self.diffusion_model.eval().requires_grad_(False) def extra_conds(self, **kwargs): out = {} From 27ffd12c45d4237338fe8789779313db9bab59f1 Mon Sep 17 00:00:00 2001 From: Daniel Harte Date: Mon, 13 Oct 2025 20:14:52 +0100 Subject: [PATCH 442/759] add indent=4 kwarg to json.dumps() (#10307) --- comfy_extras/nodes_preview_any.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_preview_any.py b/comfy_extras/nodes_preview_any.py index e6805696f..e749fa6ae 100644 --- a/comfy_extras/nodes_preview_any.py +++ b/comfy_extras/nodes_preview_any.py @@ -25,7 +25,7 @@ class PreviewAny(): value = str(source) elif source is not None: try: - value = json.dumps(source) + value = json.dumps(source, indent=4) except Exception: try: value = str(source) From 95ca2e56c82c1c714dba685bd81ebf3f7baf8efa Mon Sep 17 00:00:00 2001 From: rattus128 <46076784+rattus128@users.noreply.github.com> Date: Tue, 14 Oct 2025 05:23:11 +1000 Subject: [PATCH 443/759] WAN2.2: Fix cache VRAM leak on error (#10308) Same change pattern as 7e8dd275c243ad460ed5015d2e13611d81d2a569 applied to WAN2.2 If this suffers an exception (such as a VRAM oom) it will leave the encode() and decode() methods which skips the cleanup of the WAN feature cache. The comfy node cache then ultimately keeps a reference this object which is in turn reffing large tensors from the failed execution. The feature cache is currently setup at a class variable on the encoder/decoder however, the encode and decode functions always clear it on both entry and exit of normal execution. Its likely the design intent is this is usable as a streaming encoder where the input comes in batches, however the functions as they are today don't support that. So simplify by bringing the cache back to local variable, so that if it does VRAM OOM the cache itself is properly garbage when the encode()/decode() functions dissappear from the stack. --- comfy/ldm/wan/vae2_2.py | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/comfy/ldm/wan/vae2_2.py b/comfy/ldm/wan/vae2_2.py index 1f6d584a2..8e1593a54 100644 --- a/comfy/ldm/wan/vae2_2.py +++ b/comfy/ldm/wan/vae2_2.py @@ -657,51 +657,51 @@ class WanVAE(nn.Module): ) def encode(self, x): - self.clear_cache() + conv_idx = [0] + feat_map = [None] * count_conv3d(self.encoder) x = patchify(x, patch_size=2) t = x.shape[2] iter_ = 1 + (t - 1) // 4 for i in range(iter_): - self._enc_conv_idx = [0] + conv_idx = [0] if i == 0: out = self.encoder( x[:, :, :1, :, :], - feat_cache=self._enc_feat_map, - feat_idx=self._enc_conv_idx, + feat_cache=feat_map, + feat_idx=conv_idx, ) else: out_ = self.encoder( x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :], - feat_cache=self._enc_feat_map, - feat_idx=self._enc_conv_idx, + feat_cache=feat_map, + feat_idx=conv_idx, ) out = torch.cat([out, out_], 2) mu, log_var = self.conv1(out).chunk(2, dim=1) - self.clear_cache() return mu def decode(self, z): - self.clear_cache() + conv_idx = [0] + feat_map = [None] * count_conv3d(self.decoder) iter_ = z.shape[2] x = self.conv2(z) for i in range(iter_): - self._conv_idx = [0] + conv_idx = [0] if i == 0: out = self.decoder( x[:, :, i:i + 1, :, :], - feat_cache=self._feat_map, - feat_idx=self._conv_idx, + feat_cache=feat_map, + feat_idx=conv_idx, first_chunk=True, ) else: out_ = self.decoder( x[:, :, i:i + 1, :, :], - feat_cache=self._feat_map, - feat_idx=self._conv_idx, + feat_cache=feat_map, + feat_idx=conv_idx, ) out = torch.cat([out, out_], 2) out = unpatchify(out, patch_size=2) - self.clear_cache() return out def reparameterize(self, mu, log_var): @@ -715,12 +715,3 @@ class WanVAE(nn.Module): return mu std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0)) return mu + std * torch.randn_like(std) - - def clear_cache(self): - self._conv_num = count_conv3d(self.decoder) - self._conv_idx = [0] - self._feat_map = [None] * self._conv_num - # cache encode - self._enc_conv_num = count_conv3d(self.encoder) - self._enc_conv_idx = [0] - self._enc_feat_map = [None] * self._enc_conv_num From 3dfdcf66b643b6c191743d3b30fd8198ce690f2d Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 13 Oct 2025 22:36:26 +0300 Subject: [PATCH 444/759] convert nodes_hunyuan.py to V3 schema (#10136) --- comfy_extras/nodes_hunyuan.py | 241 +++++++++++++++++++++------------- 1 file changed, 150 insertions(+), 91 deletions(-) diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py index db398cdf1..f7c34d059 100644 --- a/comfy_extras/nodes_hunyuan.py +++ b/comfy_extras/nodes_hunyuan.py @@ -2,42 +2,60 @@ import nodes import node_helpers import torch import comfy.model_management +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class CLIPTextEncodeHunyuanDiT: +class CLIPTextEncodeHunyuanDiT(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "bert": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}), - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodeHunyuanDiT", + category="advanced/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("bert", multiline=True, dynamic_prompts=True), + io.String.Input("mt5xl", multiline=True, dynamic_prompts=True), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - CATEGORY = "advanced/conditioning" - - def encode(self, clip, bert, mt5xl): + @classmethod + def execute(cls, clip, bert, mt5xl) -> io.NodeOutput: tokens = clip.tokenize(bert) tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"] - return (clip.encode_from_tokens_scheduled(tokens), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens)) -class EmptyHunyuanLatentVideo: + encode = execute # TODO: remove + + +class EmptyHunyuanLatentVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 25, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" + def define_schema(cls): + return io.Schema( + node_id="EmptyHunyuanLatentVideo", + category="latent/video", + inputs=[ + io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=25, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(), + ], + ) - CATEGORY = "latent/video" - - def generate(self, width, height, length, batch_size=1): + @classmethod + def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - return ({"samples":latent}, ) + return io.NodeOutput({"samples":latent}) + + generate = execute # TODO: remove + PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = ( "<|start_header_id|>system<|end_header_id|>\n\n\nDescribe the video by detailing the following aspects according to the reference image: " @@ -50,45 +68,61 @@ PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = ( "<|start_header_id|>assistant<|end_header_id|>\n\n" ) -class TextEncodeHunyuanVideo_ImageToVideo: +class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "clip_vision_output": ("CLIP_VISION_OUTPUT", ), - "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}), - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="TextEncodeHunyuanVideo_ImageToVideo", + category="advanced/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.ClipVisionOutput.Input("clip_vision_output"), + io.String.Input("prompt", multiline=True, dynamic_prompts=True), + io.Int.Input( + "image_interleave", + default=2, + min=1, + max=512, + tooltip="How much the image influences things vs the text prompt. Higher number means more influence from the text prompt.", + ), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - CATEGORY = "advanced/conditioning" - - def encode(self, clip, clip_vision_output, prompt, image_interleave): + @classmethod + def execute(cls, clip, clip_vision_output, prompt, image_interleave) -> io.NodeOutput: tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave) - return (clip.encode_from_tokens_scheduled(tokens), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens)) -class HunyuanImageToVideo: + encode = execute # TODO: remove + + +class HunyuanImageToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - "guidance_type": (["v1 (concat)", "v2 (replace)", "custom"], ) - }, - "optional": {"start_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="HunyuanImageToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Vae.Input("vae"), + io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=53, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Combo.Input("guidance_type", options=["v1 (concat)", "v2 (replace)", "custom"]), + io.Image.Input("start_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None): + @classmethod + def execute(cls, positive, vae, width, height, length, batch_size, guidance_type, start_image=None) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) out_latent = {} @@ -111,51 +145,76 @@ class HunyuanImageToVideo: positive = node_helpers.conditioning_set_values(positive, cond) out_latent["samples"] = latent - return (positive, out_latent) + return io.NodeOutput(positive, out_latent) -class EmptyHunyuanImageLatent: + encode = execute # TODO: remove + + +class EmptyHunyuanImageLatent(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "width": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), - "height": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" + def define_schema(cls): + return io.Schema( + node_id="EmptyHunyuanImageLatent", + category="latent", + inputs=[ + io.Int.Input("width", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("height", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(), + ], + ) - CATEGORY = "latent" - - def generate(self, width, height, batch_size=1): + @classmethod + def execute(cls, width, height, batch_size=1) -> io.NodeOutput: latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device()) - return ({"samples":latent}, ) + return io.NodeOutput({"samples":latent}) -class HunyuanRefinerLatent: + generate = execute # TODO: remove + + +class HunyuanRefinerLatent(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "latent": ("LATENT", ), - "noise_augmentation": ("FLOAT", {"default": 0.10, "min": 0.0, "max": 1.0, "step": 0.01}), - }} + def define_schema(cls): + return io.Schema( + node_id="HunyuanRefinerLatent", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Latent.Input("latent"), + io.Float.Input("noise_augmentation", default=0.10, min=0.0, max=1.0, step=0.01), - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - FUNCTION = "execute" - - def execute(self, positive, negative, latent, noise_augmentation): + @classmethod + def execute(cls, positive, negative, latent, noise_augmentation) -> io.NodeOutput: latent = latent["samples"] positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation}) negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation}) out_latent = {} out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device()) - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -NODE_CLASS_MAPPINGS = { - "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT, - "TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo, - "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo, - "HunyuanImageToVideo": HunyuanImageToVideo, - "EmptyHunyuanImageLatent": EmptyHunyuanImageLatent, - "HunyuanRefinerLatent": HunyuanRefinerLatent, -} +class HunyuanExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + CLIPTextEncodeHunyuanDiT, + TextEncodeHunyuanVideo_ImageToVideo, + EmptyHunyuanLatentVideo, + HunyuanImageToVideo, + EmptyHunyuanImageLatent, + HunyuanRefinerLatent, + ] + + +async def comfy_entrypoint() -> HunyuanExtension: + return HunyuanExtension() From c8674bc6e9c0762e9fabe0e7f2762d5c36700963 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 13 Oct 2025 18:19:03 -0700 Subject: [PATCH 445/759] Enable RDNA4 pytorch attention on ROCm 7.0 and up. (#10332) --- comfy/model_management.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 146c00925..709ebc40b 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -345,9 +345,9 @@ try: if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950 ENABLE_PYTORCH_ATTENTION = True -# if torch_version_numeric >= (2, 8): -# if any((a in arch) for a in ["gfx1201"]): -# ENABLE_PYTORCH_ATTENTION = True + if rocm_version >= (7, 0): + if any((a in arch) for a in ["gfx1201"]): + ENABLE_PYTORCH_ATTENTION = True if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4): if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx942", "gfx950"]): # TODO: more arches SUPPORT_FP8_OPS = True From e4ea3936660a8f8dfa2467e51631362b04ad47e8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 13 Oct 2025 19:18:58 -0700 Subject: [PATCH 446/759] Fix loading old stable diffusion ckpt files on newer numpy. (#10333) --- comfy/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/comfy/utils.py b/comfy/utils.py index fab28cf08..0fd03f165 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -39,7 +39,11 @@ if hasattr(torch.serialization, "add_safe_globals"): # TODO: this was added in pass ModelCheckpoint.__module__ = "pytorch_lightning.callbacks.model_checkpoint" - from numpy.core.multiarray import scalar + def scalar(*args, **kwargs): + from numpy.core.multiarray import scalar as sc + return sc(*args, **kwargs) + scalar.__module__ = "numpy.core.multiarray" + from numpy import dtype from numpy.dtypes import Float64DType from _codecs import encode From dfff7e5332530b7278c1f90c51aed525db53489e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 13 Oct 2025 19:37:19 -0700 Subject: [PATCH 447/759] Better memory estimation for the SD/Flux VAE on AMD. (#10334) --- comfy/sd.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index b9c2e995e..28bee248d 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -276,8 +276,13 @@ class VAE: if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format sd = diffusers_convert.convert_vae_state_dict(sd) - self.memory_used_encode = lambda shape, dtype: (1767 * shape[2] * shape[3]) * model_management.dtype_size(dtype) #These are for AutoencoderKL and need tweaking (should be lower) - self.memory_used_decode = lambda shape, dtype: (2178 * shape[2] * shape[3] * 64) * model_management.dtype_size(dtype) + if model_management.is_amd(): + VAE_KL_MEM_RATIO = 2.73 + else: + VAE_KL_MEM_RATIO = 1.0 + + self.memory_used_encode = lambda shape, dtype: (1767 * shape[2] * shape[3]) * model_management.dtype_size(dtype) * VAE_KL_MEM_RATIO #These are for AutoencoderKL and need tweaking (should be lower) + self.memory_used_decode = lambda shape, dtype: (2178 * shape[2] * shape[3] * 64) * model_management.dtype_size(dtype) * VAE_KL_MEM_RATIO self.downscale_ratio = 8 self.upscale_ratio = 8 self.latent_channels = 4 From 51696e3fdcdfad657cb15854345fbcbbe70eef8d Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 13 Oct 2025 23:39:55 -0400 Subject: [PATCH 448/759] ComfyUI version 0.3.65 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index da5cde02d..d39c1fdc4 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.64" +__version__ = "0.3.65" diff --git a/pyproject.toml b/pyproject.toml index 5dcc49a47..653604e24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.64" +version = "0.3.65" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 3374e900d0f310100ebe54944175a36f287110cb Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 13 Oct 2025 20:43:53 -0700 Subject: [PATCH 449/759] Faster workflow cancelling. (#10301) --- comfy/ops.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/comfy/ops.py b/comfy/ops.py index 2415c96bf..b2096b40e 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -24,6 +24,8 @@ import comfy.float import comfy.rmsnorm import contextlib +def run_every_op(): + comfy.model_management.throw_exception_if_processing_interrupted() def scaled_dot_product_attention(q, k, v, *args, **kwargs): return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) @@ -109,6 +111,7 @@ class disable_weight_init: return torch.nn.functional.linear(input, weight, bias) def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -123,6 +126,7 @@ class disable_weight_init: return self._conv_forward(input, weight, bias) def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -137,6 +141,7 @@ class disable_weight_init: return self._conv_forward(input, weight, bias) def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -151,6 +156,7 @@ class disable_weight_init: return self._conv_forward(input, weight, bias) def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -165,6 +171,7 @@ class disable_weight_init: return torch.nn.functional.group_norm(input, self.num_groups, weight, bias, self.eps) def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -183,6 +190,7 @@ class disable_weight_init: return torch.nn.functional.layer_norm(input, self.normalized_shape, weight, bias, self.eps) def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -202,6 +210,7 @@ class disable_weight_init: # return torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps) def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -223,6 +232,7 @@ class disable_weight_init: output_padding, self.groups, self.dilation) def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -244,6 +254,7 @@ class disable_weight_init: output_padding, self.groups, self.dilation) def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -262,6 +273,7 @@ class disable_weight_init: return torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype) def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: From 84867067ea588e2a3d38a54dc34d86c96d706487 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 13 Oct 2025 23:09:12 -0700 Subject: [PATCH 450/759] Python 3.14 instructions. (#10337) --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a5a17cda..db1fdaf3c 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,9 @@ comfy install ## Manual Install (Windows, Linux) -Python 3.13 is very well supported. If you have trouble with some custom node dependencies you can try 3.12 +Python 3.14 will work if you comment out the `kornia` dependency in the requirements.txt file (breaks the canny node) and install pytorch nightly but it is not recommended. + +Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12 Git clone this repo. From 7a883849ea21003a5a649276a4cd322cb6c2ff0b Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 14 Oct 2025 09:55:56 +0300 Subject: [PATCH 451/759] api-nodes: fixed dynamic pricing format; import comfy_io directly (#10336) --- comfy_api/latest/__init__.py | 7 +- comfy_api_nodes/apinode_utils.py | 14 + comfy_api_nodes/apis/client.py | 2 +- comfy_api_nodes/nodes_bfl.py | 236 +++++++-------- comfy_api_nodes/nodes_bytedance.py | 262 ++++++++--------- comfy_api_nodes/nodes_ideogram.py | 112 ++++---- comfy_api_nodes/nodes_kling.py | 428 ++++++++++++++-------------- comfy_api_nodes/nodes_luma.py | 190 ++++++------ comfy_api_nodes/nodes_minimax.py | 108 +++---- comfy_api_nodes/nodes_moonvalley.py | 118 ++++---- comfy_api_nodes/nodes_pika.py | 188 ++++++------ comfy_api_nodes/nodes_pixverse.py | 120 ++++---- comfy_api_nodes/nodes_rodin.py | 116 ++++---- comfy_api_nodes/nodes_runway.py | 112 ++++---- comfy_api_nodes/nodes_sora.py | 32 +-- comfy_api_nodes/nodes_stability.py | 244 ++++++++-------- comfy_api_nodes/nodes_veo2.py | 74 ++--- comfy_api_nodes/nodes_vidu.py | 144 +++++----- comfy_api_nodes/nodes_wan.py | 146 +++++----- 19 files changed, 1331 insertions(+), 1322 deletions(-) diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py index b19a97f1d..b7a3fa9c1 100644 --- a/comfy_api/latest/__init__.py +++ b/comfy_api/latest/__init__.py @@ -114,7 +114,9 @@ if TYPE_CHECKING: ComfyAPISync: Type[comfy_api.latest.generated.ComfyAPISyncStub.ComfyAPISyncStub] ComfyAPISync = create_sync_class(ComfyAPI_latest) -comfy_io = io # create the new alias for io +# create new aliases for io and ui +IO = io +UI = ui __all__ = [ "ComfyAPI", @@ -124,6 +126,7 @@ __all__ = [ "Types", "ComfyExtension", "io", - "comfy_io", + "IO", "ui", + "UI", ] diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py index 4bab539f7..bc3d2d07e 100644 --- a/comfy_api_nodes/apinode_utils.py +++ b/comfy_api_nodes/apinode_utils.py @@ -3,6 +3,7 @@ import aiohttp import io import logging import mimetypes +import os from typing import Optional, Union from comfy.utils import common_upscale from comfy_api.input_impl import VideoFromFile @@ -702,3 +703,16 @@ def image_tensor_pair_to_batch( "center", ).movedim(1, -1) return torch.cat((image1, image2), dim=0) + + +def get_size(path_or_object: Union[str, io.BytesIO]) -> int: + if isinstance(path_or_object, str): + return os.path.getsize(path_or_object) + return len(path_or_object.getvalue()) + + +def validate_container_format_is_mp4(video: VideoInput) -> None: + """Validates video container format is MP4.""" + container_format = video.get_container_format() + if container_format not in ["mp4", "mov,mp4,m4a,3gp,3g2,mj2"]: + raise ValueError(f"Only MP4 container format supported. Got: {container_format}") diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py index d05e1c16a..bdaddcc88 100644 --- a/comfy_api_nodes/apis/client.py +++ b/comfy_api_nodes/apis/client.py @@ -845,7 +845,7 @@ class PollingOperation(Generic[T, R]): if not self.node_id: return if self.extracted_price is not None: - text = f"Price: {self.extracted_price}$\n{text}" + text = f"Price: ${self.extracted_price}\n{text}" PromptServer.instance.send_progress_text(text, self.node_id) def _display_time_progress_on_node(self, time_completed: int | float): diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py index 77914021d..b6cc90f05 100644 --- a/comfy_api_nodes/nodes_bfl.py +++ b/comfy_api_nodes/nodes_bfl.py @@ -3,7 +3,7 @@ import io from inspect import cleandoc from typing import Union, Optional from typing_extensions import override -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, IO from comfy_api_nodes.apis.bfl_api import ( BFLStatus, BFLFluxExpandImageRequest, @@ -131,7 +131,7 @@ def convert_image_to_base64(image: torch.Tensor): return base64.b64encode(img_byte_arr.getvalue()).decode() -class FluxProUltraImageNode(comfy_io.ComfyNode): +class FluxProUltraImageNode(IO.ComfyNode): """ Generates images using Flux Pro 1.1 Ultra via api based on prompt and resolution. """ @@ -142,25 +142,25 @@ class FluxProUltraImageNode(comfy_io.ComfyNode): MAXIMUM_RATIO_STR = "4:1" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="FluxProUltraImageNode", display_name="Flux 1.1 [pro] Ultra Image", category="api node/image/BFL", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_upsampling", default=False, tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -168,21 +168,21 @@ class FluxProUltraImageNode(comfy_io.ComfyNode): control_after_generate=True, tooltip="The random seed used for creating the noise.", ), - comfy_io.String.Input( + IO.String.Input( "aspect_ratio", default="16:9", tooltip="Aspect ratio of image; must be between 1:4 and 4:1.", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "raw", default=False, tooltip="When True, generate less processed, more natural-looking images.", ), - comfy_io.Image.Input( + IO.Image.Input( "image_prompt", optional=True, ), - comfy_io.Float.Input( + IO.Float.Input( "image_prompt_strength", default=0.1, min=0.0, @@ -192,11 +192,11 @@ class FluxProUltraImageNode(comfy_io.ComfyNode): optional=True, ), ], - outputs=[comfy_io.Image.Output()], + outputs=[IO.Image.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -225,7 +225,7 @@ class FluxProUltraImageNode(comfy_io.ComfyNode): seed=0, image_prompt=None, image_prompt_strength=0.1, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: if image_prompt is None: validate_string(prompt, strip_whitespace=False) operation = SynchronousOperation( @@ -262,10 +262,10 @@ class FluxProUltraImageNode(comfy_io.ComfyNode): }, ) output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return comfy_io.NodeOutput(output_image) + return IO.NodeOutput(output_image) -class FluxKontextProImageNode(comfy_io.ComfyNode): +class FluxKontextProImageNode(IO.ComfyNode): """ Edits images using Flux.1 Kontext [pro] via api based on prompt and aspect ratio. """ @@ -276,25 +276,25 @@ class FluxKontextProImageNode(comfy_io.ComfyNode): MAXIMUM_RATIO_STR = "4:1" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id=cls.NODE_ID, display_name=cls.DISPLAY_NAME, category="api node/image/BFL", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation - specify what and how to edit.", ), - comfy_io.String.Input( + IO.String.Input( "aspect_ratio", default="16:9", tooltip="Aspect ratio of image; must be between 1:4 and 4:1.", ), - comfy_io.Float.Input( + IO.Float.Input( "guidance", default=3.0, min=0.1, @@ -302,14 +302,14 @@ class FluxKontextProImageNode(comfy_io.ComfyNode): step=0.1, tooltip="Guidance strength for the image generation process", ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=50, min=1, max=150, tooltip="Number of steps for the image generation process", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=1234, min=0, @@ -317,21 +317,21 @@ class FluxKontextProImageNode(comfy_io.ComfyNode): control_after_generate=True, tooltip="The random seed used for creating the noise.", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_upsampling", default=False, tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - comfy_io.Image.Input( + IO.Image.Input( "input_image", optional=True, ), ], - outputs=[comfy_io.Image.Output()], + outputs=[IO.Image.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -350,7 +350,7 @@ class FluxKontextProImageNode(comfy_io.ComfyNode): input_image: Optional[torch.Tensor]=None, seed=0, prompt_upsampling=False, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: aspect_ratio = validate_aspect_ratio( aspect_ratio, minimum_ratio=cls.MINIMUM_RATIO, @@ -386,7 +386,7 @@ class FluxKontextProImageNode(comfy_io.ComfyNode): }, ) output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return comfy_io.NodeOutput(output_image) + return IO.NodeOutput(output_image) class FluxKontextMaxImageNode(FluxKontextProImageNode): @@ -400,45 +400,45 @@ class FluxKontextMaxImageNode(FluxKontextProImageNode): DISPLAY_NAME = "Flux.1 Kontext [max] Image" -class FluxProImageNode(comfy_io.ComfyNode): +class FluxProImageNode(IO.ComfyNode): """ Generates images synchronously based on prompt and resolution. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="FluxProImageNode", display_name="Flux 1.1 [pro] Image", category="api node/image/BFL", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_upsampling", default=False, tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - comfy_io.Int.Input( + IO.Int.Input( "width", default=1024, min=256, max=1440, step=32, ), - comfy_io.Int.Input( + IO.Int.Input( "height", default=768, min=256, max=1440, step=32, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -446,7 +446,7 @@ class FluxProImageNode(comfy_io.ComfyNode): control_after_generate=True, tooltip="The random seed used for creating the noise.", ), - comfy_io.Image.Input( + IO.Image.Input( "image_prompt", optional=True, ), @@ -461,11 +461,11 @@ class FluxProImageNode(comfy_io.ComfyNode): # }, # ), ], - outputs=[comfy_io.Image.Output()], + outputs=[IO.Image.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -480,7 +480,7 @@ class FluxProImageNode(comfy_io.ComfyNode): seed=0, image_prompt=None, # image_prompt_strength=0.1, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: image_prompt = ( image_prompt if image_prompt is None @@ -508,77 +508,77 @@ class FluxProImageNode(comfy_io.ComfyNode): }, ) output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return comfy_io.NodeOutput(output_image) + return IO.NodeOutput(output_image) -class FluxProExpandNode(comfy_io.ComfyNode): +class FluxProExpandNode(IO.ComfyNode): """ Outpaints image based on prompt. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="FluxProExpandNode", display_name="Flux.1 Expand Image", category="api node/image/BFL", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("image"), - comfy_io.String.Input( + IO.Image.Input("image"), + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_upsampling", default=False, tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - comfy_io.Int.Input( + IO.Int.Input( "top", default=0, min=0, max=2048, tooltip="Number of pixels to expand at the top of the image", ), - comfy_io.Int.Input( + IO.Int.Input( "bottom", default=0, min=0, max=2048, tooltip="Number of pixels to expand at the bottom of the image", ), - comfy_io.Int.Input( + IO.Int.Input( "left", default=0, min=0, max=2048, tooltip="Number of pixels to expand at the left of the image", ), - comfy_io.Int.Input( + IO.Int.Input( "right", default=0, min=0, max=2048, tooltip="Number of pixels to expand at the right of the image", ), - comfy_io.Float.Input( + IO.Float.Input( "guidance", default=60, min=1.5, max=100, tooltip="Guidance strength for the image generation process", ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=50, min=15, max=50, tooltip="Number of steps for the image generation process", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -587,11 +587,11 @@ class FluxProExpandNode(comfy_io.ComfyNode): tooltip="The random seed used for creating the noise.", ), ], - outputs=[comfy_io.Image.Output()], + outputs=[IO.Image.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -609,7 +609,7 @@ class FluxProExpandNode(comfy_io.ComfyNode): steps: int, guidance: float, seed=0, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: image = convert_image_to_base64(image) operation = SynchronousOperation( @@ -637,51 +637,51 @@ class FluxProExpandNode(comfy_io.ComfyNode): }, ) output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return comfy_io.NodeOutput(output_image) + return IO.NodeOutput(output_image) -class FluxProFillNode(comfy_io.ComfyNode): +class FluxProFillNode(IO.ComfyNode): """ Inpaints image based on mask and prompt. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="FluxProFillNode", display_name="Flux.1 Fill Image", category="api node/image/BFL", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("image"), - comfy_io.Mask.Input("mask"), - comfy_io.String.Input( + IO.Image.Input("image"), + IO.Mask.Input("mask"), + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_upsampling", default=False, tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - comfy_io.Float.Input( + IO.Float.Input( "guidance", default=60, min=1.5, max=100, tooltip="Guidance strength for the image generation process", ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=50, min=15, max=50, tooltip="Number of steps for the image generation process", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -690,11 +690,11 @@ class FluxProFillNode(comfy_io.ComfyNode): tooltip="The random seed used for creating the noise.", ), ], - outputs=[comfy_io.Image.Output()], + outputs=[IO.Image.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -709,7 +709,7 @@ class FluxProFillNode(comfy_io.ComfyNode): steps: int, guidance: float, seed=0, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: # prepare mask mask = resize_mask_to_image(mask, image) mask = convert_image_to_base64(convert_mask_to_image(mask)) @@ -738,35 +738,35 @@ class FluxProFillNode(comfy_io.ComfyNode): }, ) output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return comfy_io.NodeOutput(output_image) + return IO.NodeOutput(output_image) -class FluxProCannyNode(comfy_io.ComfyNode): +class FluxProCannyNode(IO.ComfyNode): """ Generate image using a control image (canny). """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="FluxProCannyNode", display_name="Flux.1 Canny Control Image", category="api node/image/BFL", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("control_image"), - comfy_io.String.Input( + IO.Image.Input("control_image"), + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_upsampling", default=False, tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - comfy_io.Float.Input( + IO.Float.Input( "canny_low_threshold", default=0.1, min=0.01, @@ -774,7 +774,7 @@ class FluxProCannyNode(comfy_io.ComfyNode): step=0.01, tooltip="Low threshold for Canny edge detection; ignored if skip_processing is True", ), - comfy_io.Float.Input( + IO.Float.Input( "canny_high_threshold", default=0.4, min=0.01, @@ -782,26 +782,26 @@ class FluxProCannyNode(comfy_io.ComfyNode): step=0.01, tooltip="High threshold for Canny edge detection; ignored if skip_processing is True", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "skip_preprocessing", default=False, tooltip="Whether to skip preprocessing; set to True if control_image already is canny-fied, False if it is a raw image.", ), - comfy_io.Float.Input( + IO.Float.Input( "guidance", default=30, min=1, max=100, tooltip="Guidance strength for the image generation process", ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=50, min=15, max=50, tooltip="Number of steps for the image generation process", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -810,11 +810,11 @@ class FluxProCannyNode(comfy_io.ComfyNode): tooltip="The random seed used for creating the noise.", ), ], - outputs=[comfy_io.Image.Output()], + outputs=[IO.Image.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -831,7 +831,7 @@ class FluxProCannyNode(comfy_io.ComfyNode): steps: int, guidance: float, seed=0, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: control_image = convert_image_to_base64(control_image[:, :, :, :3]) preprocessed_image = None @@ -872,54 +872,54 @@ class FluxProCannyNode(comfy_io.ComfyNode): }, ) output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return comfy_io.NodeOutput(output_image) + return IO.NodeOutput(output_image) -class FluxProDepthNode(comfy_io.ComfyNode): +class FluxProDepthNode(IO.ComfyNode): """ Generate image using a control image (depth). """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="FluxProDepthNode", display_name="Flux.1 Depth Control Image", category="api node/image/BFL", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("control_image"), - comfy_io.String.Input( + IO.Image.Input("control_image"), + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_upsampling", default=False, tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "skip_preprocessing", default=False, tooltip="Whether to skip preprocessing; set to True if control_image already is depth-ified, False if it is a raw image.", ), - comfy_io.Float.Input( + IO.Float.Input( "guidance", default=15, min=1, max=100, tooltip="Guidance strength for the image generation process", ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=50, min=15, max=50, tooltip="Number of steps for the image generation process", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -928,11 +928,11 @@ class FluxProDepthNode(comfy_io.ComfyNode): tooltip="The random seed used for creating the noise.", ), ], - outputs=[comfy_io.Image.Output()], + outputs=[IO.Image.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -947,7 +947,7 @@ class FluxProDepthNode(comfy_io.ComfyNode): steps: int, guidance: float, seed=0, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: control_image = convert_image_to_base64(control_image[:,:,:,:3]) preprocessed_image = None @@ -977,12 +977,12 @@ class FluxProDepthNode(comfy_io.ComfyNode): }, ) output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return comfy_io.NodeOutput(output_image) + return IO.NodeOutput(output_image) class BFLExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ FluxProUltraImageNode, # FluxProImageNode, diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index fcb01820c..f3d3f8d3e 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -7,7 +7,7 @@ from typing_extensions import override import torch from pydantic import BaseModel, Field -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, IO from comfy_api_nodes.util.validation_utils import ( validate_image_aspect_ratio_range, get_number_of_images, @@ -237,33 +237,33 @@ async def poll_until_finished( ).execute() -class ByteDanceImageNode(comfy_io.ComfyNode): +class ByteDanceImageNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceImageNode", display_name="ByteDance Image", category="api node/image/ByteDance", description="Generate images using ByteDance models via api based on prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=Text2ImageModelName, default=Text2ImageModelName.seedream_3, tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="The text prompt used to generate the image", ), - comfy_io.Combo.Input( + IO.Combo.Input( "size_preset", options=[label for label, _, _ in RECOMMENDED_PRESETS], tooltip="Pick a recommended size. Select Custom to use the width and height below", ), - comfy_io.Int.Input( + IO.Int.Input( "width", default=1024, min=512, @@ -271,7 +271,7 @@ class ByteDanceImageNode(comfy_io.ComfyNode): step=64, tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`", ), - comfy_io.Int.Input( + IO.Int.Input( "height", default=1024, min=512, @@ -279,28 +279,28 @@ class ByteDanceImageNode(comfy_io.ComfyNode): step=64, tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation", optional=True, ), - comfy_io.Float.Input( + IO.Float.Input( "guidance_scale", default=2.5, min=1.0, max=10.0, step=0.01, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Higher value makes the image follow the prompt more closely", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, tooltip="Whether to add an \"AI generated\" watermark to the image", @@ -308,12 +308,12 @@ class ByteDanceImageNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -329,7 +329,7 @@ class ByteDanceImageNode(comfy_io.ComfyNode): seed: int, guidance_scale: float, watermark: bool, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) w = h = None for label, tw, th in RECOMMENDED_PRESETS: @@ -367,57 +367,57 @@ class ByteDanceImageNode(comfy_io.ComfyNode): request=payload, auth_kwargs=auth_kwargs, ).execute() - return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) + return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) -class ByteDanceImageEditNode(comfy_io.ComfyNode): +class ByteDanceImageEditNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceImageEditNode", display_name="ByteDance Image Edit", category="api node/image/ByteDance", description="Edit images using ByteDance models via api based on prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=Image2ImageModelName, default=Image2ImageModelName.seededit_3, tooltip="Model name", ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="The base image to edit", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Instruction to edit image", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation", optional=True, ), - comfy_io.Float.Input( + IO.Float.Input( "guidance_scale", default=5.5, min=1.0, max=10.0, step=0.01, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Higher value makes the image follow the prompt more closely", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, tooltip="Whether to add an \"AI generated\" watermark to the image", @@ -425,12 +425,12 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -444,7 +444,7 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode): seed: int, guidance_scale: float, watermark: bool, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) if get_number_of_images(image) != 1: raise ValueError("Exactly one input image is required.") @@ -477,42 +477,42 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode): request=payload, auth_kwargs=auth_kwargs, ).execute() - return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) + return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) -class ByteDanceSeedreamNode(comfy_io.ComfyNode): +class ByteDanceSeedreamNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceSeedreamNode", display_name="ByteDance Seedream 4", category="api node/image/ByteDance", description="Unified text-to-image generation and precise single-sentence editing at up to 4K resolution.", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["seedream-4-0-250828"], tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text prompt for creating or editing an image.", ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="Input image(s) for image-to-image generation. " "List of 1-10 images for single or multi-reference generation.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "size_preset", options=[label for label, _, _ in RECOMMENDED_PRESETS_SEEDREAM_4], tooltip="Pick a recommended size. Select Custom to use the width and height below.", ), - comfy_io.Int.Input( + IO.Int.Input( "width", default=2048, min=1024, @@ -521,7 +521,7 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "height", default=2048, min=1024, @@ -530,7 +530,7 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "sequential_image_generation", options=["disabled", "auto"], tooltip="Group image generation mode. " @@ -539,35 +539,35 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): "(e.g., story scenes, character variations).", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "max_images", default=1, min=1, max=15, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Maximum number of images to generate when sequential_image_generation='auto'. " "Total images (input + generated) cannot exceed 15.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, tooltip="Whether to add an \"AI generated\" watermark to the image.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "fail_on_partial", default=True, tooltip="If enabled, abort execution if any requested images are missing or return an error.", @@ -575,12 +575,12 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -599,7 +599,7 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): seed: int = 0, watermark: bool = True, fail_on_partial: bool = True, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) w = h = None for label, tw, th in RECOMMENDED_PRESETS_SEEDREAM_4: @@ -657,72 +657,72 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): ).execute() if len(response.data) == 1: - return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) + return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) urls = [str(d["url"]) for d in response.data if isinstance(d, dict) and "url" in d] if fail_on_partial and len(urls) < len(response.data): raise RuntimeError(f"Only {len(urls)} of {len(response.data)} images were generated before error.") - return comfy_io.NodeOutput(torch.cat([await download_url_to_image_tensor(i) for i in urls])) + return IO.NodeOutput(torch.cat([await download_url_to_image_tensor(i) for i in urls])) -class ByteDanceTextToVideoNode(comfy_io.ComfyNode): +class ByteDanceTextToVideoNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceTextToVideoNode", display_name="ByteDance Text to Video", category="api node/video/ByteDance", description="Generate video using ByteDance models via api based on prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=Text2VideoModelName, default=Text2VideoModelName.seedance_1_pro, tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="The text prompt used to generate the video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=["480p", "720p", "1080p"], tooltip="The resolution of the output video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=["16:9", "4:3", "1:1", "3:4", "9:16", "21:9"], tooltip="The aspect ratio of the output video.", ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=3, max=12, step=1, tooltip="The duration of the output video in seconds.", - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "camera_fixed", default=False, tooltip="Specifies whether to fix the camera. The platform appends an instruction " "to fix the camera to your prompt, but does not guarantee the actual effect.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, tooltip="Whether to add an \"AI generated\" watermark to the video.", @@ -730,12 +730,12 @@ class ByteDanceTextToVideoNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -751,7 +751,7 @@ class ByteDanceTextToVideoNode(comfy_io.ComfyNode): seed: int, camera_fixed: bool, watermark: bool, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"]) @@ -781,69 +781,69 @@ class ByteDanceTextToVideoNode(comfy_io.ComfyNode): ) -class ByteDanceImageToVideoNode(comfy_io.ComfyNode): +class ByteDanceImageToVideoNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceImageToVideoNode", display_name="ByteDance Image to Video", category="api node/video/ByteDance", description="Generate video using ByteDance models via api based on image and prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=Image2VideoModelName, default=Image2VideoModelName.seedance_1_pro, tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="The text prompt used to generate the video.", ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="First frame to be used for the video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=["480p", "720p", "1080p"], tooltip="The resolution of the output video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=["adaptive", "16:9", "4:3", "1:1", "3:4", "9:16", "21:9"], tooltip="The aspect ratio of the output video.", ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=3, max=12, step=1, tooltip="The duration of the output video in seconds.", - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "camera_fixed", default=False, tooltip="Specifies whether to fix the camera. The platform appends an instruction " "to fix the camera to your prompt, but does not guarantee the actual effect.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, tooltip="Whether to add an \"AI generated\" watermark to the video.", @@ -851,12 +851,12 @@ class ByteDanceImageToVideoNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -873,7 +873,7 @@ class ByteDanceImageToVideoNode(comfy_io.ComfyNode): seed: int, camera_fixed: bool, watermark: bool, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"]) validate_image_dimensions(image, min_width=300, min_height=300, max_width=6000, max_height=6000) @@ -908,73 +908,73 @@ class ByteDanceImageToVideoNode(comfy_io.ComfyNode): ) -class ByteDanceFirstLastFrameNode(comfy_io.ComfyNode): +class ByteDanceFirstLastFrameNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceFirstLastFrameNode", display_name="ByteDance First-Last-Frame to Video", category="api node/video/ByteDance", description="Generate video using prompt and first and last frames.", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=[model.value for model in Image2VideoModelName], default=Image2VideoModelName.seedance_1_lite.value, tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="The text prompt used to generate the video.", ), - comfy_io.Image.Input( + IO.Image.Input( "first_frame", tooltip="First frame to be used for the video.", ), - comfy_io.Image.Input( + IO.Image.Input( "last_frame", tooltip="Last frame to be used for the video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=["480p", "720p", "1080p"], tooltip="The resolution of the output video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=["adaptive", "16:9", "4:3", "1:1", "3:4", "9:16", "21:9"], tooltip="The aspect ratio of the output video.", ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=3, max=12, step=1, tooltip="The duration of the output video in seconds.", - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "camera_fixed", default=False, tooltip="Specifies whether to fix the camera. The platform appends an instruction " "to fix the camera to your prompt, but does not guarantee the actual effect.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, tooltip="Whether to add an \"AI generated\" watermark to the video.", @@ -982,12 +982,12 @@ class ByteDanceFirstLastFrameNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1005,7 +1005,7 @@ class ByteDanceFirstLastFrameNode(comfy_io.ComfyNode): seed: int, camera_fixed: bool, watermark: bool, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"]) for i in (first_frame, last_frame): @@ -1050,62 +1050,62 @@ class ByteDanceFirstLastFrameNode(comfy_io.ComfyNode): ) -class ByteDanceImageReferenceNode(comfy_io.ComfyNode): +class ByteDanceImageReferenceNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceImageReferenceNode", display_name="ByteDance Reference Images to Video", category="api node/video/ByteDance", description="Generate video using prompt and reference images.", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=[Image2VideoModelName.seedance_1_lite.value], default=Image2VideoModelName.seedance_1_lite.value, tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="The text prompt used to generate the video.", ), - comfy_io.Image.Input( + IO.Image.Input( "images", tooltip="One to four images.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=["480p", "720p"], tooltip="The resolution of the output video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=["adaptive", "16:9", "4:3", "1:1", "3:4", "9:16", "21:9"], tooltip="The aspect ratio of the output video.", ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=3, max=12, step=1, tooltip="The duration of the output video in seconds.", - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, tooltip="Whether to add an \"AI generated\" watermark to the video.", @@ -1113,12 +1113,12 @@ class ByteDanceImageReferenceNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1134,7 +1134,7 @@ class ByteDanceImageReferenceNode(comfy_io.ComfyNode): duration: int, seed: int, watermark: bool, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "watermark"]) for image in images: @@ -1180,7 +1180,7 @@ async def process_video_task( auth_kwargs: dict, node_id: str, estimated_duration: Optional[int], -) -> comfy_io.NodeOutput: +) -> IO.NodeOutput: initial_response = await SynchronousOperation( endpoint=ApiEndpoint( path=BYTEPLUS_TASK_ENDPOINT, @@ -1197,7 +1197,7 @@ async def process_video_task( estimated_duration=estimated_duration, node_id=node_id, ) - return comfy_io.NodeOutput(await download_url_to_video_output(get_video_url_from_task_status(response))) + return IO.NodeOutput(await download_url_to_video_output(get_video_url_from_task_status(response))) def raise_if_text_params(prompt: str, text_params: list[str]) -> None: @@ -1210,7 +1210,7 @@ def raise_if_text_params(prompt: str, text_params: list[str]) -> None: class ByteDanceExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ ByteDanceImageNode, ByteDanceImageEditNode, diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index 2d1c32e4f..9eae5f11a 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -1,6 +1,6 @@ from io import BytesIO from typing_extensions import override -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, IO from PIL import Image import numpy as np import torch @@ -246,76 +246,76 @@ def display_image_urls_on_node(image_urls, node_id): PromptServer.instance.send_progress_text(urls_text, node_id) -class IdeogramV1(comfy_io.ComfyNode): +class IdeogramV1(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="IdeogramV1", display_name="Ideogram V1", category="api node/image/Ideogram", description="Generates images using the Ideogram V1 model.", is_api_node=True, inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "turbo", default=False, tooltip="Whether to use turbo mode (faster generation, potentially lower quality)", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=list(V1_V2_RATIO_MAP.keys()), default="1:1", tooltip="The aspect ratio for image generation.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "magic_prompt_option", options=["AUTO", "ON", "OFF"], default="AUTO", tooltip="Determine if MagicPrompt should be used in generation", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, control_after_generate=True, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Description of what to exclude from the image", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "num_images", default=1, min=1, max=8, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], ) @@ -372,39 +372,39 @@ class IdeogramV1(comfy_io.ComfyNode): raise Exception("No image URLs were generated in the response") display_image_urls_on_node(image_urls, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_and_process_images(image_urls)) + return IO.NodeOutput(await download_and_process_images(image_urls)) -class IdeogramV2(comfy_io.ComfyNode): +class IdeogramV2(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="IdeogramV2", display_name="Ideogram V2", category="api node/image/Ideogram", description="Generates images using the Ideogram V2 model.", is_api_node=True, inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "turbo", default=False, tooltip="Whether to use turbo mode (faster generation, potentially lower quality)", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=list(V1_V2_RATIO_MAP.keys()), default="1:1", tooltip="The aspect ratio for image generation. Ignored if resolution is not set to AUTO.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=list(V1_V1_RES_MAP.keys()), default="Auto", @@ -412,44 +412,44 @@ class IdeogramV2(comfy_io.ComfyNode): "If not set to AUTO, this overrides the aspect_ratio setting.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "magic_prompt_option", options=["AUTO", "ON", "OFF"], default="AUTO", tooltip="Determine if MagicPrompt should be used in generation", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, control_after_generate=True, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "style_type", options=["AUTO", "GENERAL", "REALISTIC", "DESIGN", "RENDER_3D", "ANIME"], default="NONE", tooltip="Style type for generation (V2 only)", optional=True, ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Description of what to exclude from the image", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "num_images", default=1, min=1, max=8, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), #"color_palette": ( @@ -462,12 +462,12 @@ class IdeogramV2(comfy_io.ComfyNode): #), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], ) @@ -541,14 +541,14 @@ class IdeogramV2(comfy_io.ComfyNode): raise Exception("No image URLs were generated in the response") display_image_urls_on_node(image_urls, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_and_process_images(image_urls)) + return IO.NodeOutput(await download_and_process_images(image_urls)) -class IdeogramV3(comfy_io.ComfyNode): +class IdeogramV3(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="IdeogramV3", display_name="Ideogram V3", category="api node/image/Ideogram", @@ -556,30 +556,30 @@ class IdeogramV3(comfy_io.ComfyNode): "Supports both regular image generation from text prompts and image editing with mask.", is_api_node=True, inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation or editing", ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="Optional reference image for image editing.", optional=True, ), - comfy_io.Mask.Input( + IO.Mask.Input( "mask", tooltip="Optional mask for inpainting (white areas will be replaced)", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=list(V3_RATIO_MAP.keys()), default="1:1", tooltip="The aspect ratio for image generation. Ignored if resolution is not set to Auto.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=V3_RESOLUTIONS, default="Auto", @@ -587,57 +587,57 @@ class IdeogramV3(comfy_io.ComfyNode): "If not set to Auto, this overrides the aspect_ratio setting.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "magic_prompt_option", options=["AUTO", "ON", "OFF"], default="AUTO", tooltip="Determine if MagicPrompt should be used in generation", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, control_after_generate=True, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "num_images", default=1, min=1, max=8, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "rendering_speed", options=["DEFAULT", "TURBO", "QUALITY"], default="DEFAULT", tooltip="Controls the trade-off between generation speed and quality", optional=True, ), - comfy_io.Image.Input( + IO.Image.Input( "character_image", tooltip="Image to use as character reference.", optional=True, ), - comfy_io.Mask.Input( + IO.Mask.Input( "character_mask", tooltip="Optional mask for character reference image.", optional=True, ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], ) @@ -826,12 +826,12 @@ class IdeogramV3(comfy_io.ComfyNode): raise Exception("No image URLs were generated in the response") display_image_urls_on_node(image_urls, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_and_process_images(image_urls)) + return IO.NodeOutput(await download_and_process_images(image_urls)) class IdeogramExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ IdeogramV1, IdeogramV2, diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 2117cfa91..67c8307c5 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -76,7 +76,7 @@ from comfy_api_nodes.util.validation_utils import ( from comfy_api.input_impl import VideoFromFile from comfy_api.input.basic_types import AudioInput from comfy_api.input.video_types import VideoInput -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, IO KLING_API_VERSION = "v1" PATH_TEXT_TO_VIDEO = f"/proxy/kling/{KLING_API_VERSION}/videos/text2video" @@ -387,7 +387,7 @@ async def execute_text2video( duration: str, aspect_ratio: str, camera_control: Optional[KlingCameraControl] = None, -) -> comfy_io.NodeOutput: +) -> IO.NodeOutput: validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V) initial_operation = SynchronousOperation( endpoint=ApiEndpoint( @@ -428,7 +428,7 @@ async def execute_text2video( validate_video_result_response(final_response) video = get_video_from_response(final_response) - return comfy_io.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) + return IO.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) async def execute_image2video( @@ -444,7 +444,7 @@ async def execute_image2video( duration: str, camera_control: Optional[KlingCameraControl] = None, end_frame: Optional[torch.Tensor] = None, -) -> comfy_io.NodeOutput: +) -> IO.NodeOutput: validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_I2V) validate_input_image(start_frame) @@ -499,7 +499,7 @@ async def execute_image2video( validate_video_result_response(final_response) video = get_video_from_response(final_response) - return comfy_io.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) + return IO.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) async def execute_video_effect( @@ -576,7 +576,7 @@ async def execute_lipsync( text: Optional[str] = None, voice_speed: Optional[float] = None, voice_id: Optional[str] = None, -) -> comfy_io.NodeOutput: +) -> IO.NodeOutput: if text: validate_string(text, field_name="Text", max_length=MAX_PROMPT_LENGTH_LIP_SYNC) validate_video_dimensions(video, 720, 1920) @@ -634,77 +634,77 @@ async def execute_lipsync( validate_video_result_response(final_response) video = get_video_from_response(final_response) - return comfy_io.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) + return IO.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) -class KlingCameraControls(comfy_io.ComfyNode): +class KlingCameraControls(IO.ComfyNode): """Kling Camera Controls Node""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="KlingCameraControls", display_name="Kling Camera Controls", category="api node/video/Kling", description="Allows specifying configuration options for Kling Camera Controls and motion control effects.", inputs=[ - comfy_io.Combo.Input("camera_control_type", options=KlingCameraControlType), - comfy_io.Float.Input( + IO.Combo.Input("camera_control_type", options=KlingCameraControlType), + IO.Float.Input( "horizontal_movement", default=0.0, min=-10.0, max=10.0, step=0.25, - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, tooltip="Controls camera's movement along horizontal axis (x-axis). Negative indicates left, positive indicates right", ), - comfy_io.Float.Input( + IO.Float.Input( "vertical_movement", default=0.0, min=-10.0, max=10.0, step=0.25, - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, tooltip="Controls camera's movement along vertical axis (y-axis). Negative indicates downward, positive indicates upward.", ), - comfy_io.Float.Input( + IO.Float.Input( "pan", default=0.5, min=-10.0, max=10.0, step=0.25, - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, tooltip="Controls camera's rotation in vertical plane (x-axis). Negative indicates downward rotation, positive indicates upward rotation.", ), - comfy_io.Float.Input( + IO.Float.Input( "tilt", default=0.0, min=-10.0, max=10.0, step=0.25, - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, tooltip="Controls camera's rotation in horizontal plane (y-axis). Negative indicates left rotation, positive indicates right rotation.", ), - comfy_io.Float.Input( + IO.Float.Input( "roll", default=0.0, min=-10.0, max=10.0, step=0.25, - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, tooltip="Controls camera's rolling amount (z-axis). Negative indicates counterclockwise, positive indicates clockwise.", ), - comfy_io.Float.Input( + IO.Float.Input( "zoom", default=0.0, min=-10.0, max=10.0, step=0.25, - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, tooltip="Controls change in camera's focal length. Negative indicates narrower field of view, positive indicates wider field of view.", ), ], - outputs=[comfy_io.Custom("CAMERA_CONTROL").Output(display_name="camera_control")], + outputs=[IO.Custom("CAMERA_CONTROL").Output(display_name="camera_control")], ) @classmethod @@ -740,8 +740,8 @@ class KlingCameraControls(comfy_io.ComfyNode): tilt: float, roll: float, zoom: float, - ) -> comfy_io.NodeOutput: - return comfy_io.NodeOutput( + ) -> IO.NodeOutput: + return IO.NodeOutput( KlingCameraControl( type=KlingCameraControlType(camera_control_type), config=KlingCameraConfig( @@ -756,27 +756,27 @@ class KlingCameraControls(comfy_io.ComfyNode): ) -class KlingTextToVideoNode(comfy_io.ComfyNode): +class KlingTextToVideoNode(IO.ComfyNode): """Kling Text to Video Node""" @classmethod - def define_schema(cls) -> comfy_io.Schema: + def define_schema(cls) -> IO.Schema: modes = list(MODE_TEXT2VIDEO.keys()) - return comfy_io.Schema( + return IO.Schema( node_id="KlingTextToVideoNode", display_name="Kling Text to Video", category="api node/video/Kling", description="Kling Text to Video Node", inputs=[ - comfy_io.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), - comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), - comfy_io.Float.Input("cfg_scale", default=1.0, min=0.0, max=1.0), - comfy_io.Combo.Input( + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + IO.Float.Input("cfg_scale", default=1.0, min=0.0, max=1.0), + IO.Combo.Input( "aspect_ratio", options=KlingVideoGenAspectRatio, default="16:9", ), - comfy_io.Combo.Input( + IO.Combo.Input( "mode", options=modes, default=modes[4], @@ -784,14 +784,14 @@ class KlingTextToVideoNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), - comfy_io.String.Output(display_name="video_id"), - comfy_io.String.Output(display_name="duration"), + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -804,7 +804,7 @@ class KlingTextToVideoNode(comfy_io.ComfyNode): cfg_scale: float, mode: str, aspect_ratio: str, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: model_mode, duration, model_name = MODE_TEXT2VIDEO[mode] return await execute_text2video( auth_kwargs={ @@ -822,42 +822,42 @@ class KlingTextToVideoNode(comfy_io.ComfyNode): ) -class KlingCameraControlT2VNode(comfy_io.ComfyNode): +class KlingCameraControlT2VNode(IO.ComfyNode): """ Kling Text to Video Camera Control Node. This node is a text to video node, but it supports controlling the camera. Duration, mode, and model_name request fields are hard-coded because camera control is only supported in pro mode with the kling-v1-5 model at 5s duration as of 2025-05-02. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="KlingCameraControlT2VNode", display_name="Kling Text to Video (Camera Control)", category="api node/video/Kling", description="Transform text into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original text.", inputs=[ - comfy_io.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), - comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), - comfy_io.Float.Input("cfg_scale", default=0.75, min=0.0, max=1.0), - comfy_io.Combo.Input( + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + IO.Float.Input("cfg_scale", default=0.75, min=0.0, max=1.0), + IO.Combo.Input( "aspect_ratio", options=KlingVideoGenAspectRatio, default="16:9", ), - comfy_io.Custom("CAMERA_CONTROL").Input( + IO.Custom("CAMERA_CONTROL").Input( "camera_control", tooltip="Can be created using the Kling Camera Controls node. Controls the camera movement and motion during the video generation.", ), ], outputs=[ - comfy_io.Video.Output(), - comfy_io.String.Output(display_name="video_id"), - comfy_io.String.Output(display_name="duration"), + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -870,7 +870,7 @@ class KlingCameraControlT2VNode(comfy_io.ComfyNode): cfg_scale: float, aspect_ratio: str, camera_control: Optional[KlingCameraControl] = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: return await execute_text2video( auth_kwargs={ "auth_token": cls.hidden.auth_token_comfy_org, @@ -888,43 +888,43 @@ class KlingCameraControlT2VNode(comfy_io.ComfyNode): ) -class KlingImage2VideoNode(comfy_io.ComfyNode): +class KlingImage2VideoNode(IO.ComfyNode): """Kling Image to Video Node""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="KlingImage2VideoNode", display_name="Kling Image to Video", category="api node/video/Kling", description="Kling Image to Video Node", inputs=[ - comfy_io.Image.Input("start_frame", tooltip="The reference image used to generate the video."), - comfy_io.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), - comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), - comfy_io.Combo.Input( + IO.Image.Input("start_frame", tooltip="The reference image used to generate the video."), + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + IO.Combo.Input( "model_name", options=KlingVideoGenModelName, default="kling-v2-master", ), - comfy_io.Float.Input("cfg_scale", default=0.8, min=0.0, max=1.0), - comfy_io.Combo.Input("mode", options=KlingVideoGenMode, default=KlingVideoGenMode.std), - comfy_io.Combo.Input( + IO.Float.Input("cfg_scale", default=0.8, min=0.0, max=1.0), + IO.Combo.Input("mode", options=KlingVideoGenMode, default=KlingVideoGenMode.std), + IO.Combo.Input( "aspect_ratio", options=KlingVideoGenAspectRatio, default=KlingVideoGenAspectRatio.field_16_9, ), - comfy_io.Combo.Input("duration", options=KlingVideoGenDuration, default=KlingVideoGenDuration.field_5), + IO.Combo.Input("duration", options=KlingVideoGenDuration, default=KlingVideoGenDuration.field_5), ], outputs=[ - comfy_io.Video.Output(), - comfy_io.String.Output(display_name="video_id"), - comfy_io.String.Output(display_name="duration"), + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -942,7 +942,7 @@ class KlingImage2VideoNode(comfy_io.ComfyNode): duration: str, camera_control: Optional[KlingCameraControl] = None, end_frame: Optional[torch.Tensor] = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: return await execute_image2video( auth_kwargs={ "auth_token": cls.hidden.auth_token_comfy_org, @@ -962,46 +962,46 @@ class KlingImage2VideoNode(comfy_io.ComfyNode): ) -class KlingCameraControlI2VNode(comfy_io.ComfyNode): +class KlingCameraControlI2VNode(IO.ComfyNode): """ Kling Image to Video Camera Control Node. This node is a image to video node, but it supports controlling the camera. Duration, mode, and model_name request fields are hard-coded because camera control is only supported in pro mode with the kling-v1-5 model at 5s duration as of 2025-05-02. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="KlingCameraControlI2VNode", display_name="Kling Image to Video (Camera Control)", category="api node/video/Kling", description="Transform still images into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original image.", inputs=[ - comfy_io.Image.Input( + IO.Image.Input( "start_frame", tooltip="Reference Image - URL or Base64 encoded string, cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1. Base64 should not include data:image prefix.", ), - comfy_io.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), - comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), - comfy_io.Float.Input("cfg_scale", default=0.75, min=0.0, max=1.0), - comfy_io.Combo.Input( + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + IO.Float.Input("cfg_scale", default=0.75, min=0.0, max=1.0), + IO.Combo.Input( "aspect_ratio", options=KlingVideoGenAspectRatio, default=KlingVideoGenAspectRatio.field_16_9, ), - comfy_io.Custom("CAMERA_CONTROL").Input( + IO.Custom("CAMERA_CONTROL").Input( "camera_control", tooltip="Can be created using the Kling Camera Controls node. Controls the camera movement and motion during the video generation.", ), ], outputs=[ - comfy_io.Video.Output(), - comfy_io.String.Output(display_name="video_id"), - comfy_io.String.Output(display_name="duration"), + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1015,7 +1015,7 @@ class KlingCameraControlI2VNode(comfy_io.ComfyNode): cfg_scale: float, aspect_ratio: str, camera_control: KlingCameraControl, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: return await execute_image2video( auth_kwargs={ "auth_token": cls.hidden.auth_token_comfy_org, @@ -1034,37 +1034,37 @@ class KlingCameraControlI2VNode(comfy_io.ComfyNode): ) -class KlingStartEndFrameNode(comfy_io.ComfyNode): +class KlingStartEndFrameNode(IO.ComfyNode): """ Kling First Last Frame Node. This node allows creation of a video from a first and last frame. It calls the normal image to video endpoint, but only allows the subset of input options that support the `image_tail` request field. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: + def define_schema(cls) -> IO.Schema: modes = list(MODE_START_END_FRAME.keys()) - return comfy_io.Schema( + return IO.Schema( node_id="KlingStartEndFrameNode", display_name="Kling Start-End Frame to Video", category="api node/video/Kling", description="Generate a video sequence that transitions between your provided start and end images. The node creates all frames in between, producing a smooth transformation from the first frame to the last.", inputs=[ - comfy_io.Image.Input( + IO.Image.Input( "start_frame", tooltip="Reference Image - URL or Base64 encoded string, cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1. Base64 should not include data:image prefix.", ), - comfy_io.Image.Input( + IO.Image.Input( "end_frame", tooltip="Reference Image - End frame control. URL or Base64 encoded string, cannot exceed 10MB, resolution not less than 300*300px. Base64 should not include data:image prefix.", ), - comfy_io.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), - comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), - comfy_io.Float.Input("cfg_scale", default=0.5, min=0.0, max=1.0), - comfy_io.Combo.Input( + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + IO.Float.Input("cfg_scale", default=0.5, min=0.0, max=1.0), + IO.Combo.Input( "aspect_ratio", options=[i.value for i in KlingVideoGenAspectRatio], default="16:9", ), - comfy_io.Combo.Input( + IO.Combo.Input( "mode", options=modes, default=modes[2], @@ -1072,14 +1072,14 @@ class KlingStartEndFrameNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), - comfy_io.String.Output(display_name="video_id"), - comfy_io.String.Output(display_name="duration"), + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1094,7 +1094,7 @@ class KlingStartEndFrameNode(comfy_io.ComfyNode): cfg_scale: float, aspect_ratio: str, mode: str, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: mode, duration, model_name = MODE_START_END_FRAME[mode] return await execute_image2video( auth_kwargs={ @@ -1114,41 +1114,41 @@ class KlingStartEndFrameNode(comfy_io.ComfyNode): ) -class KlingVideoExtendNode(comfy_io.ComfyNode): +class KlingVideoExtendNode(IO.ComfyNode): @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="KlingVideoExtendNode", display_name="Kling Video Extend", category="api node/video/Kling", description="Kling Video Extend Node. Extend videos made by other Kling nodes. The video_id is created by using other Kling Nodes.", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="Positive text prompt for guiding the video extension", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, tooltip="Negative text prompt for elements to avoid in the extended video", ), - comfy_io.Float.Input("cfg_scale", default=0.5, min=0.0, max=1.0), - comfy_io.String.Input( + IO.Float.Input("cfg_scale", default=0.5, min=0.0, max=1.0), + IO.String.Input( "video_id", force_input=True, tooltip="The ID of the video to be extended. Supports videos generated by text-to-video, image-to-video, and previous video extension operations. Cannot exceed 3 minutes total duration after extension.", ), ], outputs=[ - comfy_io.Video.Output(), - comfy_io.String.Output(display_name="video_id"), - comfy_io.String.Output(display_name="duration"), + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1160,7 +1160,7 @@ class KlingVideoExtendNode(comfy_io.ComfyNode): negative_prompt: str, cfg_scale: float, video_id: str, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V) auth = { "auth_token": cls.hidden.auth_token_comfy_org, @@ -1201,49 +1201,49 @@ class KlingVideoExtendNode(comfy_io.ComfyNode): validate_video_result_response(final_response) video = get_video_from_response(final_response) - return comfy_io.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) + return IO.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) -class KlingDualCharacterVideoEffectNode(comfy_io.ComfyNode): +class KlingDualCharacterVideoEffectNode(IO.ComfyNode): """Kling Dual Character Video Effect Node""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="KlingDualCharacterVideoEffectNode", display_name="Kling Dual Character Video Effects", category="api node/video/Kling", description="Achieve different special effects when generating a video based on the effect_scene. First image will be positioned on left side, second on right side of the composite.", inputs=[ - comfy_io.Image.Input("image_left", tooltip="Left side image"), - comfy_io.Image.Input("image_right", tooltip="Right side image"), - comfy_io.Combo.Input( + IO.Image.Input("image_left", tooltip="Left side image"), + IO.Image.Input("image_right", tooltip="Right side image"), + IO.Combo.Input( "effect_scene", options=[i.value for i in KlingDualCharacterEffectsScene], ), - comfy_io.Combo.Input( + IO.Combo.Input( "model_name", options=[i.value for i in KlingCharacterEffectModelName], default="kling-v1", ), - comfy_io.Combo.Input( + IO.Combo.Input( "mode", options=[i.value for i in KlingVideoGenMode], default="std", ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", options=[i.value for i in KlingVideoGenDuration], ), ], outputs=[ - comfy_io.Video.Output(), - comfy_io.String.Output(display_name="duration"), + IO.Video.Output(), + IO.String.Output(display_name="duration"), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1257,7 +1257,7 @@ class KlingDualCharacterVideoEffectNode(comfy_io.ComfyNode): model_name: KlingCharacterEffectModelName, mode: KlingVideoGenMode, duration: KlingVideoGenDuration, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: video, _, duration = await execute_video_effect( auth_kwargs={ "auth_token": cls.hidden.auth_token_comfy_org, @@ -1272,43 +1272,43 @@ class KlingDualCharacterVideoEffectNode(comfy_io.ComfyNode): image_1=image_left, image_2=image_right, ) - return comfy_io.NodeOutput(video, duration) + return IO.NodeOutput(video, duration) -class KlingSingleImageVideoEffectNode(comfy_io.ComfyNode): +class KlingSingleImageVideoEffectNode(IO.ComfyNode): """Kling Single Image Video Effect Node""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="KlingSingleImageVideoEffectNode", display_name="Kling Video Effects", category="api node/video/Kling", description="Achieve different special effects when generating a video based on the effect_scene.", inputs=[ - comfy_io.Image.Input("image", tooltip=" Reference Image. URL or Base64 encoded string (without data:image prefix). File size cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1"), - comfy_io.Combo.Input( + IO.Image.Input("image", tooltip=" Reference Image. URL or Base64 encoded string (without data:image prefix). File size cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1"), + IO.Combo.Input( "effect_scene", options=[i.value for i in KlingSingleImageEffectsScene], ), - comfy_io.Combo.Input( + IO.Combo.Input( "model_name", options=[i.value for i in KlingSingleImageEffectModelName], ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", options=[i.value for i in KlingVideoGenDuration], ), ], outputs=[ - comfy_io.Video.Output(), - comfy_io.String.Output(display_name="video_id"), - comfy_io.String.Output(display_name="duration"), + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1320,8 +1320,8 @@ class KlingSingleImageVideoEffectNode(comfy_io.ComfyNode): effect_scene: KlingSingleImageEffectsScene, model_name: KlingSingleImageEffectModelName, duration: KlingVideoGenDuration, - ) -> comfy_io.NodeOutput: - return comfy_io.NodeOutput( + ) -> IO.NodeOutput: + return IO.NodeOutput( *( await execute_video_effect( auth_kwargs={ @@ -1339,34 +1339,34 @@ class KlingSingleImageVideoEffectNode(comfy_io.ComfyNode): ) -class KlingLipSyncAudioToVideoNode(comfy_io.ComfyNode): +class KlingLipSyncAudioToVideoNode(IO.ComfyNode): """Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file.""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="KlingLipSyncAudioToVideoNode", display_name="Kling Lip Sync Video with Audio", category="api node/video/Kling", description="Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file. When using, ensure that the audio contains clearly distinguishable vocals and that the video contains a distinct face. The audio file should not be larger than 5MB. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.", inputs=[ - comfy_io.Video.Input("video"), - comfy_io.Audio.Input("audio"), - comfy_io.Combo.Input( + IO.Video.Input("video"), + IO.Audio.Input("audio"), + IO.Combo.Input( "voice_language", options=[i.value for i in KlingLipSyncVoiceLanguage], default="en", ), ], outputs=[ - comfy_io.Video.Output(), - comfy_io.String.Output(display_name="video_id"), - comfy_io.String.Output(display_name="duration"), + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1377,7 +1377,7 @@ class KlingLipSyncAudioToVideoNode(comfy_io.ComfyNode): video: VideoInput, audio: AudioInput, voice_language: str, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: return await execute_lipsync( auth_kwargs={ "auth_token": cls.hidden.auth_token_comfy_org, @@ -1391,46 +1391,46 @@ class KlingLipSyncAudioToVideoNode(comfy_io.ComfyNode): ) -class KlingLipSyncTextToVideoNode(comfy_io.ComfyNode): +class KlingLipSyncTextToVideoNode(IO.ComfyNode): """Kling Lip Sync Text to Video Node. Syncs mouth movements in a video file to a text prompt.""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="KlingLipSyncTextToVideoNode", display_name="Kling Lip Sync Video with Text", category="api node/video/Kling", description="Kling Lip Sync Text to Video Node. Syncs mouth movements in a video file to a text prompt. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.", inputs=[ - comfy_io.Video.Input("video"), - comfy_io.String.Input( + IO.Video.Input("video"), + IO.String.Input( "text", multiline=True, tooltip="Text Content for Lip-Sync Video Generation. Required when mode is text2video. Maximum length is 120 characters.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "voice", options=list(VOICES_CONFIG.keys()), default="Melody", ), - comfy_io.Float.Input( + IO.Float.Input( "voice_speed", default=1, min=0.8, max=2.0, - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, tooltip="Speech Rate. Valid range: 0.8~2.0, accurate to one decimal place.", ), ], outputs=[ - comfy_io.Video.Output(), - comfy_io.String.Output(display_name="video_id"), - comfy_io.String.Output(display_name="duration"), + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1442,7 +1442,7 @@ class KlingLipSyncTextToVideoNode(comfy_io.ComfyNode): text: str, voice: str, voice_speed: float, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: voice_id, voice_language = VOICES_CONFIG[voice] return await execute_lipsync( auth_kwargs={ @@ -1459,32 +1459,32 @@ class KlingLipSyncTextToVideoNode(comfy_io.ComfyNode): ) -class KlingVirtualTryOnNode(comfy_io.ComfyNode): +class KlingVirtualTryOnNode(IO.ComfyNode): """Kling Virtual Try On Node.""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="KlingVirtualTryOnNode", display_name="Kling Virtual Try On", category="api node/image/Kling", description="Kling Virtual Try On Node. Input a human image and a cloth image to try on the cloth on the human. You can merge multiple clothing item pictures into one image with a white background.", inputs=[ - comfy_io.Image.Input("human_image"), - comfy_io.Image.Input("cloth_image"), - comfy_io.Combo.Input( + IO.Image.Input("human_image"), + IO.Image.Input("cloth_image"), + IO.Combo.Input( "model_name", options=[i.value for i in KlingVirtualTryOnModelName], default="kolors-virtual-try-on-v1", ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1495,7 +1495,7 @@ class KlingVirtualTryOnNode(comfy_io.ComfyNode): human_image: torch.Tensor, cloth_image: torch.Tensor, model_name: KlingVirtualTryOnModelName, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: auth = { "auth_token": cls.hidden.auth_token_comfy_org, "comfy_api_key": cls.hidden.api_key_comfy_org, @@ -1534,70 +1534,70 @@ class KlingVirtualTryOnNode(comfy_io.ComfyNode): validate_image_result_response(final_response) images = get_images_from_response(final_response) - return comfy_io.NodeOutput(await image_result_to_node_output(images)) + return IO.NodeOutput(await image_result_to_node_output(images)) -class KlingImageGenerationNode(comfy_io.ComfyNode): +class KlingImageGenerationNode(IO.ComfyNode): """Kling Image Generation Node. Generate an image from a text prompt with an optional reference image.""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="KlingImageGenerationNode", display_name="Kling Image Generation", category="api node/image/Kling", description="Kling Image Generation Node. Generate an image from a text prompt with an optional reference image.", inputs=[ - comfy_io.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), - comfy_io.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), - comfy_io.Combo.Input( + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + IO.Combo.Input( "image_type", options=[i.value for i in KlingImageGenImageReferenceType], ), - comfy_io.Float.Input( + IO.Float.Input( "image_fidelity", default=0.5, min=0.0, max=1.0, step=0.01, - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, tooltip="Reference intensity for user-uploaded images", ), - comfy_io.Float.Input( + IO.Float.Input( "human_fidelity", default=0.45, min=0.0, max=1.0, step=0.01, - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, tooltip="Subject reference similarity", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model_name", options=[i.value for i in KlingImageGenModelName], default="kling-v1", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=[i.value for i in KlingImageGenAspectRatio], default="16:9", ), - comfy_io.Int.Input( + IO.Int.Input( "n", default=1, min=1, max=9, tooltip="Number of generated images", ), - comfy_io.Image.Input("image", optional=True), + IO.Image.Input("image", optional=True), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1614,7 +1614,7 @@ class KlingImageGenerationNode(comfy_io.ComfyNode): n: int, aspect_ratio: KlingImageGenAspectRatio, image: Optional[torch.Tensor] = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, field_name="prompt", min_length=1, max_length=MAX_PROMPT_LENGTH_IMAGE_GEN) validate_string(negative_prompt, field_name="negative_prompt", max_length=MAX_PROMPT_LENGTH_IMAGE_GEN) @@ -1669,12 +1669,12 @@ class KlingImageGenerationNode(comfy_io.ComfyNode): validate_image_result_response(final_response) images = get_images_from_response(final_response) - return comfy_io.NodeOutput(await image_result_to_node_output(images)) + return IO.NodeOutput(await image_result_to_node_output(images)) class KlingExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ KlingCameraControls, KlingTextToVideoNode, diff --git a/comfy_api_nodes/nodes_luma.py b/comfy_api_nodes/nodes_luma.py index 9cab2ca82..610d95a77 100644 --- a/comfy_api_nodes/nodes_luma.py +++ b/comfy_api_nodes/nodes_luma.py @@ -2,7 +2,7 @@ from __future__ import annotations from inspect import cleandoc from typing import Optional from typing_extensions import override -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, IO from comfy_api.input_impl.video_types import VideoFromFile from comfy_api_nodes.apis.luma_api import ( LumaImageModel, @@ -52,24 +52,24 @@ def image_result_url_extractor(response: LumaGeneration): def video_result_url_extractor(response: LumaGeneration): return response.assets.video if hasattr(response, "assets") and hasattr(response.assets, "video") else None -class LumaReferenceNode(comfy_io.ComfyNode): +class LumaReferenceNode(IO.ComfyNode): """ Holds an image and weight for use with Luma Generate Image node. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="LumaReferenceNode", display_name="Luma Reference", category="api node/image/Luma", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="Image to use as reference.", ), - comfy_io.Float.Input( + IO.Float.Input( "weight", default=1.0, min=0.0, @@ -77,71 +77,71 @@ class LumaReferenceNode(comfy_io.ComfyNode): step=0.01, tooltip="Weight of image reference.", ), - comfy_io.Custom(LumaIO.LUMA_REF).Input( + IO.Custom(LumaIO.LUMA_REF).Input( "luma_ref", optional=True, ), ], - outputs=[comfy_io.Custom(LumaIO.LUMA_REF).Output(display_name="luma_ref")], + outputs=[IO.Custom(LumaIO.LUMA_REF).Output(display_name="luma_ref")], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], ) @classmethod def execute( cls, image: torch.Tensor, weight: float, luma_ref: LumaReferenceChain = None - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: if luma_ref is not None: luma_ref = luma_ref.clone() else: luma_ref = LumaReferenceChain() luma_ref.add(LumaReference(image=image, weight=round(weight, 2))) - return comfy_io.NodeOutput(luma_ref) + return IO.NodeOutput(luma_ref) -class LumaConceptsNode(comfy_io.ComfyNode): +class LumaConceptsNode(IO.ComfyNode): """ Holds one or more Camera Concepts for use with Luma Text to Video and Luma Image to Video nodes. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="LumaConceptsNode", display_name="Luma Concepts", category="api node/video/Luma", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "concept1", options=get_luma_concepts(include_none=True), ), - comfy_io.Combo.Input( + IO.Combo.Input( "concept2", options=get_luma_concepts(include_none=True), ), - comfy_io.Combo.Input( + IO.Combo.Input( "concept3", options=get_luma_concepts(include_none=True), ), - comfy_io.Combo.Input( + IO.Combo.Input( "concept4", options=get_luma_concepts(include_none=True), ), - comfy_io.Custom(LumaIO.LUMA_CONCEPTS).Input( + IO.Custom(LumaIO.LUMA_CONCEPTS).Input( "luma_concepts", tooltip="Optional Camera Concepts to add to the ones chosen here.", optional=True, ), ], - outputs=[comfy_io.Custom(LumaIO.LUMA_CONCEPTS).Output(display_name="luma_concepts")], + outputs=[IO.Custom(LumaIO.LUMA_CONCEPTS).Output(display_name="luma_concepts")], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], ) @@ -153,42 +153,42 @@ class LumaConceptsNode(comfy_io.ComfyNode): concept3: str, concept4: str, luma_concepts: LumaConceptChain = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: chain = LumaConceptChain(str_list=[concept1, concept2, concept3, concept4]) if luma_concepts is not None: chain = luma_concepts.clone_and_merge(chain) - return comfy_io.NodeOutput(chain) + return IO.NodeOutput(chain) -class LumaImageGenerationNode(comfy_io.ComfyNode): +class LumaImageGenerationNode(IO.ComfyNode): """ Generates images synchronously based on prompt and aspect ratio. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="LumaImageNode", display_name="Luma Text to Image", category="api node/image/Luma", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=LumaImageModel, ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=LumaAspectRatio, default=LumaAspectRatio.ratio_16_9, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -196,7 +196,7 @@ class LumaImageGenerationNode(comfy_io.ComfyNode): control_after_generate=True, tooltip="Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", ), - comfy_io.Float.Input( + IO.Float.Input( "style_image_weight", default=1.0, min=0.0, @@ -204,27 +204,27 @@ class LumaImageGenerationNode(comfy_io.ComfyNode): step=0.01, tooltip="Weight of style image. Ignored if no style_image provided.", ), - comfy_io.Custom(LumaIO.LUMA_REF).Input( + IO.Custom(LumaIO.LUMA_REF).Input( "image_luma_ref", tooltip="Luma Reference node connection to influence generation with input images; up to 4 images can be considered.", optional=True, ), - comfy_io.Image.Input( + IO.Image.Input( "style_image", tooltip="Style reference image; only 1 image will be used.", optional=True, ), - comfy_io.Image.Input( + IO.Image.Input( "character_image", tooltip="Character reference images; can be a batch of multiple, up to 4 images can be considered.", optional=True, ), ], - outputs=[comfy_io.Image.Output()], + outputs=[IO.Image.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -240,7 +240,7 @@ class LumaImageGenerationNode(comfy_io.ComfyNode): image_luma_ref: LumaReferenceChain = None, style_image: torch.Tensor = None, character_image: torch.Tensor = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=3) auth_kwargs = { "auth_token": cls.hidden.auth_token_comfy_org, @@ -306,7 +306,7 @@ class LumaImageGenerationNode(comfy_io.ComfyNode): async with aiohttp.ClientSession() as session: async with session.get(response_poll.assets.image) as img_response: img = process_image_response(await img_response.content.read()) - return comfy_io.NodeOutput(img) + return IO.NodeOutput(img) @classmethod async def _convert_luma_refs( @@ -334,29 +334,29 @@ class LumaImageGenerationNode(comfy_io.ComfyNode): return await cls._convert_luma_refs(chain, max_refs=1, auth_kwargs=auth_kwargs) -class LumaImageModifyNode(comfy_io.ComfyNode): +class LumaImageModifyNode(IO.ComfyNode): """ Modifies images synchronously based on prompt and aspect ratio. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="LumaImageModifyNode", display_name="Luma Image to Image", category="api node/image/Luma", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input( + IO.Image.Input( "image", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation", ), - comfy_io.Float.Input( + IO.Float.Input( "image_weight", default=0.1, min=0.0, @@ -364,11 +364,11 @@ class LumaImageModifyNode(comfy_io.ComfyNode): step=0.01, tooltip="Weight of the image; the closer to 1.0, the less the image will be modified.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=LumaImageModel, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -377,11 +377,11 @@ class LumaImageModifyNode(comfy_io.ComfyNode): tooltip="Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", ), ], - outputs=[comfy_io.Image.Output()], + outputs=[IO.Image.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -394,7 +394,7 @@ class LumaImageModifyNode(comfy_io.ComfyNode): image: torch.Tensor, image_weight: float, seed, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: auth_kwargs = { "auth_token": cls.hidden.auth_token_comfy_org, "comfy_api_key": cls.hidden.api_key_comfy_org, @@ -442,51 +442,51 @@ class LumaImageModifyNode(comfy_io.ComfyNode): async with aiohttp.ClientSession() as session: async with session.get(response_poll.assets.image) as img_response: img = process_image_response(await img_response.content.read()) - return comfy_io.NodeOutput(img) + return IO.NodeOutput(img) -class LumaTextToVideoGenerationNode(comfy_io.ComfyNode): +class LumaTextToVideoGenerationNode(IO.ComfyNode): """ Generates videos synchronously based on prompt and output_size. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="LumaVideoNode", display_name="Luma Text to Video", category="api node/video/Luma", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the video generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=LumaVideoModel, ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=LumaAspectRatio, default=LumaAspectRatio.ratio_16_9, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=LumaVideoOutputResolution, default=LumaVideoOutputResolution.res_540p, ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", options=LumaVideoModelOutputDuration, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "loop", default=False, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -494,17 +494,17 @@ class LumaTextToVideoGenerationNode(comfy_io.ComfyNode): control_after_generate=True, tooltip="Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", ), - comfy_io.Custom(LumaIO.LUMA_CONCEPTS).Input( + IO.Custom(LumaIO.LUMA_CONCEPTS).Input( "luma_concepts", tooltip="Optional Camera Concepts to dictate camera motion via the Luma Concepts node.", optional=True, ) ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -520,7 +520,7 @@ class LumaTextToVideoGenerationNode(comfy_io.ComfyNode): loop: bool, seed, luma_concepts: LumaConceptChain = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False, min_length=3) duration = duration if model != LumaVideoModel.ray_1_6 else None resolution = resolution if model != LumaVideoModel.ray_1_6 else None @@ -571,51 +571,51 @@ class LumaTextToVideoGenerationNode(comfy_io.ComfyNode): async with aiohttp.ClientSession() as session: async with session.get(response_poll.assets.video) as vid_response: - return comfy_io.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) + return IO.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) -class LumaImageToVideoGenerationNode(comfy_io.ComfyNode): +class LumaImageToVideoGenerationNode(IO.ComfyNode): """ Generates videos synchronously based on prompt, input images, and output_size. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="LumaImageToVideoNode", display_name="Luma Image to Video", category="api node/video/Luma", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the video generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=LumaVideoModel, ), - # comfy_io.Combo.Input( + # IO.Combo.Input( # "aspect_ratio", # options=[ratio.value for ratio in LumaAspectRatio], # default=LumaAspectRatio.ratio_16_9, # ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=LumaVideoOutputResolution, default=LumaVideoOutputResolution.res_540p, ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", options=[dur.value for dur in LumaVideoModelOutputDuration], ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "loop", default=False, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -623,27 +623,27 @@ class LumaImageToVideoGenerationNode(comfy_io.ComfyNode): control_after_generate=True, tooltip="Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", ), - comfy_io.Image.Input( + IO.Image.Input( "first_image", tooltip="First frame of generated video.", optional=True, ), - comfy_io.Image.Input( + IO.Image.Input( "last_image", tooltip="Last frame of generated video.", optional=True, ), - comfy_io.Custom(LumaIO.LUMA_CONCEPTS).Input( + IO.Custom(LumaIO.LUMA_CONCEPTS).Input( "luma_concepts", tooltip="Optional Camera Concepts to dictate camera motion via the Luma Concepts node.", optional=True, ) ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -660,7 +660,7 @@ class LumaImageToVideoGenerationNode(comfy_io.ComfyNode): first_image: torch.Tensor = None, last_image: torch.Tensor = None, luma_concepts: LumaConceptChain = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: if first_image is None and last_image is None: raise Exception( "At least one of first_image and last_image requires an input." @@ -716,7 +716,7 @@ class LumaImageToVideoGenerationNode(comfy_io.ComfyNode): async with aiohttp.ClientSession() as session: async with session.get(response_poll.assets.video) as vid_response: - return comfy_io.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) + return IO.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) @classmethod async def _convert_to_keyframes( @@ -744,7 +744,7 @@ class LumaImageToVideoGenerationNode(comfy_io.ComfyNode): class LumaExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ LumaImageGenerationNode, LumaImageModifyNode, diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index caa3d4260..23be1ae65 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -4,7 +4,7 @@ import logging import torch from typing_extensions import override -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, IO from comfy_api.input_impl.video_types import VideoFromFile from comfy_api_nodes.apis import ( MinimaxVideoGenerationRequest, @@ -43,7 +43,7 @@ async def _generate_mm_video( image: Optional[torch.Tensor] = None, # used for ImageToVideo subject: Optional[torch.Tensor] = None, # used for SubjectToVideo average_duration: Optional[int] = None, -) -> comfy_io.NodeOutput: +) -> IO.NodeOutput: if image is None: validate_string(prompt_text, field_name="prompt_text") # upload image, if passed in @@ -133,35 +133,35 @@ async def _generate_mm_video( error_msg = f"Failed to download video from {file_url}" logging.error(error_msg) raise Exception(error_msg) - return comfy_io.NodeOutput(VideoFromFile(video_io)) + return IO.NodeOutput(VideoFromFile(video_io)) -class MinimaxTextToVideoNode(comfy_io.ComfyNode): +class MinimaxTextToVideoNode(IO.ComfyNode): """ Generates videos synchronously based on a prompt, and optional parameters using MiniMax's API. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MinimaxTextToVideoNode", display_name="MiniMax Text to Video", category="api node/video/MiniMax", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt_text", multiline=True, default="", tooltip="Text prompt to guide the video generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["T2V-01", "T2V-01-Director"], default="T2V-01", tooltip="Model to use for video generation", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -172,11 +172,11 @@ class MinimaxTextToVideoNode(comfy_io.ComfyNode): optional=True, ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -187,7 +187,7 @@ class MinimaxTextToVideoNode(comfy_io.ComfyNode): prompt_text: str, model: str = "T2V-01", seed: int = 0, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: return await _generate_mm_video( auth={ "auth_token": cls.hidden.auth_token_comfy_org, @@ -203,36 +203,36 @@ class MinimaxTextToVideoNode(comfy_io.ComfyNode): ) -class MinimaxImageToVideoNode(comfy_io.ComfyNode): +class MinimaxImageToVideoNode(IO.ComfyNode): """ Generates videos synchronously based on an image and prompt, and optional parameters using MiniMax's API. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MinimaxImageToVideoNode", display_name="MiniMax Image to Video", category="api node/video/MiniMax", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="Image to use as first frame of video generation", ), - comfy_io.String.Input( + IO.String.Input( "prompt_text", multiline=True, default="", tooltip="Text prompt to guide the video generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["I2V-01-Director", "I2V-01", "I2V-01-live"], default="I2V-01", tooltip="Model to use for video generation", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -243,11 +243,11 @@ class MinimaxImageToVideoNode(comfy_io.ComfyNode): optional=True, ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -259,7 +259,7 @@ class MinimaxImageToVideoNode(comfy_io.ComfyNode): prompt_text: str, model: str = "I2V-01", seed: int = 0, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: return await _generate_mm_video( auth={ "auth_token": cls.hidden.auth_token_comfy_org, @@ -275,36 +275,36 @@ class MinimaxImageToVideoNode(comfy_io.ComfyNode): ) -class MinimaxSubjectToVideoNode(comfy_io.ComfyNode): +class MinimaxSubjectToVideoNode(IO.ComfyNode): """ Generates videos synchronously based on an image and prompt, and optional parameters using MiniMax's API. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MinimaxSubjectToVideoNode", display_name="MiniMax Subject to Video", category="api node/video/MiniMax", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input( + IO.Image.Input( "subject", tooltip="Image of subject to reference for video generation", ), - comfy_io.String.Input( + IO.String.Input( "prompt_text", multiline=True, default="", tooltip="Text prompt to guide the video generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["S2V-01"], default="S2V-01", tooltip="Model to use for video generation", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -315,11 +315,11 @@ class MinimaxSubjectToVideoNode(comfy_io.ComfyNode): optional=True, ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -331,7 +331,7 @@ class MinimaxSubjectToVideoNode(comfy_io.ComfyNode): prompt_text: str, model: str = "S2V-01", seed: int = 0, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: return await _generate_mm_video( auth={ "auth_token": cls.hidden.auth_token_comfy_org, @@ -347,24 +347,24 @@ class MinimaxSubjectToVideoNode(comfy_io.ComfyNode): ) -class MinimaxHailuoVideoNode(comfy_io.ComfyNode): +class MinimaxHailuoVideoNode(IO.ComfyNode): """Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MinimaxHailuoVideoNode", display_name="MiniMax Hailuo Video", category="api node/video/MiniMax", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt_text", multiline=True, default="", tooltip="Text prompt to guide the video generation.", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -374,25 +374,25 @@ class MinimaxHailuoVideoNode(comfy_io.ComfyNode): tooltip="The random seed used for creating the noise.", optional=True, ), - comfy_io.Image.Input( + IO.Image.Input( "first_frame_image", tooltip="Optional image to use as the first frame to generate a video.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_optimizer", default=True, tooltip="Optimize prompt to improve generation quality when needed.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", options=[6, 10], default=6, tooltip="The length of the output video in seconds.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=["768P", "1080P"], default="768P", @@ -400,11 +400,11 @@ class MinimaxHailuoVideoNode(comfy_io.ComfyNode): optional=True, ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -419,7 +419,7 @@ class MinimaxHailuoVideoNode(comfy_io.ComfyNode): duration: int = 6, resolution: str = "768P", model: str = "MiniMax-Hailuo-02", - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: auth = { "auth_token": cls.hidden.auth_token_comfy_org, "comfy_api_key": cls.hidden.api_key_comfy_org, @@ -513,12 +513,12 @@ class MinimaxHailuoVideoNode(comfy_io.ComfyNode): error_msg = f"Failed to download video from {file_url}" logging.error(error_msg) raise Exception(error_msg) - return comfy_io.NodeOutput(VideoFromFile(video_io)) + return IO.NodeOutput(VideoFromFile(video_io)) class MinimaxExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ MinimaxTextToVideoNode, MinimaxImageToVideoNode, diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 77e4b536c..7566188dd 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -22,10 +22,11 @@ from comfy_api_nodes.apinode_utils import ( download_url_to_video_output, upload_images_to_comfyapi, upload_video_to_comfyapi, + validate_container_format_is_mp4, ) from comfy_api.input import VideoInput -from comfy_api.latest import ComfyExtension, InputImpl, io as comfy_io +from comfy_api.latest import ComfyExtension, InputImpl, IO import av import io @@ -144,7 +145,7 @@ def validate_video_to_video_input(video: VideoInput) -> VideoInput: """ width, height = _get_video_dimensions(video) _validate_video_dimensions(width, height) - _validate_container_format(video) + validate_container_format_is_mp4(video) return _validate_and_trim_duration(video) @@ -177,15 +178,6 @@ def _validate_video_dimensions(width: int, height: int) -> None: ) -def _validate_container_format(video: VideoInput) -> None: - """Validates video container format is MP4.""" - container_format = video.get_container_format() - if container_format not in ["mp4", "mov,mp4,m4a,3gp,3g2,mj2"]: - raise ValueError( - f"Only MP4 container format supported. Got: {container_format}" - ) - - def _validate_and_trim_duration(video: VideoInput) -> VideoInput: """Validates video duration and trims to 5 seconds if needed.""" duration = video.get_duration() @@ -362,25 +354,25 @@ async def get_response( ) -class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): +class MoonvalleyImg2VideoNode(IO.ComfyNode): @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MoonvalleyImg2VideoNode", display_name="Moonvalley Marey Image to Video", category="api node/video/Moonvalley Marey", description="Moonvalley Marey Image to Video Node", inputs=[ - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="The reference image used to generate the video", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default=" gopro, bright, contrast, static, overexposed, vignette, " @@ -391,7 +383,7 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): "wobbly, weird, low quality, plastic, stock footage, video camera, boring", tooltip="Negative prompt text", ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=[ "16:9 (1920 x 1080)", @@ -404,7 +396,7 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): default="16:9 (1920 x 1080)", tooltip="Resolution of the output video", ), - comfy_io.Float.Input( + IO.Float.Input( "prompt_adherence", default=4.5, min=1.0, @@ -412,17 +404,17 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): step=1.0, tooltip="Guidance scale for generation control", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=9, min=0, max=4294967295, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Random seed value", control_after_generate=True, ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=33, min=1, @@ -431,11 +423,11 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): tooltip="Number of denoising steps", ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -450,7 +442,7 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): prompt_adherence: float, seed: int, steps: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_image_dimensions(image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH) validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = parse_width_height_from_res(resolution) @@ -500,25 +492,25 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): task_id, auth_kwargs=auth, node_id=cls.hidden.unique_id ) video = await download_url_to_video_output(final_response.output_url) - return comfy_io.NodeOutput(video) + return IO.NodeOutput(video) -class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): +class MoonvalleyVideo2VideoNode(IO.ComfyNode): @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MoonvalleyVideo2VideoNode", display_name="Moonvalley Marey Video to Video", category="api node/video/Moonvalley Marey", description="", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="Describes the video to generate", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default=" gopro, bright, contrast, static, overexposed, vignette, " @@ -529,28 +521,28 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): "wobbly, weird, low quality, plastic, stock footage, video camera, boring", tooltip="Negative prompt text", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=9, min=0, max=4294967295, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Random seed value", control_after_generate=False, ), - comfy_io.Video.Input( + IO.Video.Input( "video", tooltip="The reference video used to generate the output video. Must be at least 5 seconds long. " "Videos longer than 5s will be automatically trimmed. Only MP4 format supported.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "control_type", options=["Motion Transfer", "Pose Transfer"], default="Motion Transfer", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "motion_intensity", default=100, min=0, @@ -559,21 +551,21 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): tooltip="Only used if control_type is 'Motion Transfer'", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=33, min=1, max=100, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Number of inference steps", ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -589,7 +581,7 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): motion_intensity: Optional[int] = 100, steps=33, prompt_adherence=4.5, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: auth = { "auth_token": cls.hidden.auth_token_comfy_org, "comfy_api_key": cls.hidden.api_key_comfy_org, @@ -641,24 +633,24 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): ) video = await download_url_to_video_output(final_response.output_url) - return comfy_io.NodeOutput(video) + return IO.NodeOutput(video) -class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): +class MoonvalleyTxt2VideoNode(IO.ComfyNode): @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MoonvalleyTxt2VideoNode", display_name="Moonvalley Marey Text to Video", category="api node/video/Moonvalley Marey", description="", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default=" gopro, bright, contrast, static, overexposed, vignette, " @@ -669,7 +661,7 @@ class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): "wobbly, weird, low quality, plastic, stock footage, video camera, boring", tooltip="Negative prompt text", ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=[ "16:9 (1920 x 1080)", @@ -682,7 +674,7 @@ class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): default="16:9 (1920 x 1080)", tooltip="Resolution of the output video", ), - comfy_io.Float.Input( + IO.Float.Input( "prompt_adherence", default=4.0, min=1.0, @@ -690,17 +682,17 @@ class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): step=1.0, tooltip="Guidance scale for generation control", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=9, min=0, max=4294967295, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Random seed value", ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=33, min=1, @@ -709,11 +701,11 @@ class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): tooltip="Inference steps", ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -727,7 +719,7 @@ class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): prompt_adherence: float, seed: int, steps: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = parse_width_height_from_res(resolution) @@ -768,12 +760,12 @@ class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): ) video = await download_url_to_video_output(final_response.output_url) - return comfy_io.NodeOutput(video) + return IO.NodeOutput(video) class MoonvalleyExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ MoonvalleyImg2VideoNode, MoonvalleyTxt2VideoNode, diff --git a/comfy_api_nodes/nodes_pika.py b/comfy_api_nodes/nodes_pika.py index 822cfee64..27cb0067b 100644 --- a/comfy_api_nodes/nodes_pika.py +++ b/comfy_api_nodes/nodes_pika.py @@ -12,7 +12,7 @@ from typing import Optional, TypeVar import torch from typing_extensions import override -from comfy_api.latest import ComfyExtension, comfy_io +from comfy_api.latest import ComfyExtension, IO from comfy_api.input_impl.video_types import VideoCodec, VideoContainer, VideoInput from comfy_api_nodes.apinode_utils import ( download_url_to_video_output, @@ -47,7 +47,7 @@ async def execute_task( initial_operation: SynchronousOperation[R, pika_defs.PikaGenerateResponse], auth_kwargs: Optional[dict[str, str]] = None, node_id: Optional[str] = None, -) -> comfy_io.NodeOutput: +) -> IO.NodeOutput: task_id = (await initial_operation.execute()).video_id final_response: pika_defs.PikaVideoResponse = await PollingOperation( poll_endpoint=ApiEndpoint( @@ -72,39 +72,39 @@ async def execute_task( raise Exception(error_msg) video_url = final_response.url logging.info("Pika task %s succeeded. Video URL: %s", task_id, video_url) - return comfy_io.NodeOutput(await download_url_to_video_output(video_url)) + return IO.NodeOutput(await download_url_to_video_output(video_url)) -def get_base_inputs_types() -> list[comfy_io.Input]: +def get_base_inputs_types() -> list[IO.Input]: """Get the base required inputs types common to all Pika nodes.""" return [ - comfy_io.String.Input("prompt_text", multiline=True), - comfy_io.String.Input("negative_prompt", multiline=True), - comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), - comfy_io.Combo.Input("resolution", options=["1080p", "720p"], default="1080p"), - comfy_io.Combo.Input("duration", options=[5, 10], default=5), + IO.String.Input("prompt_text", multiline=True), + IO.String.Input("negative_prompt", multiline=True), + IO.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), + IO.Combo.Input("resolution", options=["1080p", "720p"], default="1080p"), + IO.Combo.Input("duration", options=[5, 10], default=5), ] -class PikaImageToVideo(comfy_io.ComfyNode): +class PikaImageToVideo(IO.ComfyNode): """Pika 2.2 Image to Video Node.""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="PikaImageToVideoNode2_2", display_name="Pika Image to Video", description="Sends an image and prompt to the Pika API v2.2 to generate a video.", category="api node/video/Pika", inputs=[ - comfy_io.Image.Input("image", tooltip="The image to convert to video"), + IO.Image.Input("image", tooltip="The image to convert to video"), *get_base_inputs_types(), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -118,7 +118,7 @@ class PikaImageToVideo(comfy_io.ComfyNode): seed: int, resolution: str, duration: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: image_bytes_io = tensor_to_bytesio(image) pika_files = {"image": ("image.png", image_bytes_io, "image/png")} pika_request_data = pika_defs.PikaBodyGenerate22I2vGenerate22I2vPost( @@ -147,19 +147,19 @@ class PikaImageToVideo(comfy_io.ComfyNode): return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaTextToVideoNode(comfy_io.ComfyNode): +class PikaTextToVideoNode(IO.ComfyNode): """Pika Text2Video v2.2 Node.""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="PikaTextToVideoNode2_2", display_name="Pika Text to Video", description="Sends a text prompt to the Pika API v2.2 to generate a video.", category="api node/video/Pika", inputs=[ *get_base_inputs_types(), - comfy_io.Float.Input( + IO.Float.Input( "aspect_ratio", step=0.001, min=0.4, @@ -168,11 +168,11 @@ class PikaTextToVideoNode(comfy_io.ComfyNode): tooltip="Aspect ratio (width / height)", ) ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -186,7 +186,7 @@ class PikaTextToVideoNode(comfy_io.ComfyNode): resolution: str, duration: int, aspect_ratio: float, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: auth = { "auth_token": cls.hidden.auth_token_comfy_org, "comfy_api_key": cls.hidden.api_key_comfy_org, @@ -212,24 +212,24 @@ class PikaTextToVideoNode(comfy_io.ComfyNode): return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaScenes(comfy_io.ComfyNode): +class PikaScenes(IO.ComfyNode): """PikaScenes v2.2 Node.""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="PikaScenesV2_2", display_name="Pika Scenes (Video Image Composition)", description="Combine your images to create a video with the objects in them. Upload multiple images as ingredients and generate a high-quality video that incorporates all of them.", category="api node/video/Pika", inputs=[ *get_base_inputs_types(), - comfy_io.Combo.Input( + IO.Combo.Input( "ingredients_mode", options=["creative", "precise"], default="creative", ), - comfy_io.Float.Input( + IO.Float.Input( "aspect_ratio", step=0.001, min=0.4, @@ -237,37 +237,37 @@ class PikaScenes(comfy_io.ComfyNode): default=1.7777777777777777, tooltip="Aspect ratio (width / height)", ), - comfy_io.Image.Input( + IO.Image.Input( "image_ingredient_1", optional=True, tooltip="Image that will be used as ingredient to create a video.", ), - comfy_io.Image.Input( + IO.Image.Input( "image_ingredient_2", optional=True, tooltip="Image that will be used as ingredient to create a video.", ), - comfy_io.Image.Input( + IO.Image.Input( "image_ingredient_3", optional=True, tooltip="Image that will be used as ingredient to create a video.", ), - comfy_io.Image.Input( + IO.Image.Input( "image_ingredient_4", optional=True, tooltip="Image that will be used as ingredient to create a video.", ), - comfy_io.Image.Input( + IO.Image.Input( "image_ingredient_5", optional=True, tooltip="Image that will be used as ingredient to create a video.", ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -287,7 +287,7 @@ class PikaScenes(comfy_io.ComfyNode): image_ingredient_3: Optional[torch.Tensor] = None, image_ingredient_4: Optional[torch.Tensor] = None, image_ingredient_5: Optional[torch.Tensor] = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: all_image_bytes_io = [] for image in [ image_ingredient_1, @@ -333,33 +333,33 @@ class PikaScenes(comfy_io.ComfyNode): return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikAdditionsNode(comfy_io.ComfyNode): +class PikAdditionsNode(IO.ComfyNode): """Pika Pikadditions Node. Add an image into a video.""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="Pikadditions", display_name="Pikadditions (Video Object Insertion)", description="Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result.", category="api node/video/Pika", inputs=[ - comfy_io.Video.Input("video", tooltip="The video to add an image to."), - comfy_io.Image.Input("image", tooltip="The image to add to the video."), - comfy_io.String.Input("prompt_text", multiline=True), - comfy_io.String.Input("negative_prompt", multiline=True), - comfy_io.Int.Input( + IO.Video.Input("video", tooltip="The video to add an image to."), + IO.Image.Input("image", tooltip="The image to add to the video."), + IO.String.Input("prompt_text", multiline=True), + IO.String.Input("negative_prompt", multiline=True), + IO.Int.Input( "seed", min=0, max=0xFFFFFFFF, control_after_generate=True, ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -372,7 +372,7 @@ class PikAdditionsNode(comfy_io.ComfyNode): prompt_text: str, negative_prompt: str, seed: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: video_bytes_io = BytesIO() video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264) video_bytes_io.seek(0) @@ -407,43 +407,43 @@ class PikAdditionsNode(comfy_io.ComfyNode): return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaSwapsNode(comfy_io.ComfyNode): +class PikaSwapsNode(IO.ComfyNode): """Pika Pikaswaps Node.""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="Pikaswaps", display_name="Pika Swaps (Video Object Replacement)", description="Swap out any object or region of your video with a new image or object. Define areas to replace either with a mask or coordinates.", category="api node/video/Pika", inputs=[ - comfy_io.Video.Input("video", tooltip="The video to swap an object in."), - comfy_io.Image.Input( + IO.Video.Input("video", tooltip="The video to swap an object in."), + IO.Image.Input( "image", tooltip="The image used to replace the masked object in the video.", optional=True, ), - comfy_io.Mask.Input( + IO.Mask.Input( "mask", tooltip="Use the mask to define areas in the video to replace.", optional=True, ), - comfy_io.String.Input("prompt_text", multiline=True, optional=True), - comfy_io.String.Input("negative_prompt", multiline=True, optional=True), - comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True, optional=True), - comfy_io.String.Input( + IO.String.Input("prompt_text", multiline=True, optional=True), + IO.String.Input("negative_prompt", multiline=True, optional=True), + IO.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True, optional=True), + IO.String.Input( "region_to_modify", multiline=True, optional=True, tooltip="Plaintext description of the object / region to modify.", ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -458,7 +458,7 @@ class PikaSwapsNode(comfy_io.ComfyNode): negative_prompt: str = "", seed: int = 0, region_to_modify: str = "", - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: video_bytes_io = BytesIO() video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264) video_bytes_io.seek(0) @@ -495,30 +495,30 @@ class PikaSwapsNode(comfy_io.ComfyNode): return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaffectsNode(comfy_io.ComfyNode): +class PikaffectsNode(IO.ComfyNode): """Pika Pikaffects Node.""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="Pikaffects", display_name="Pikaffects (Video Effects)", description="Generate a video with a specific Pikaffect. Supported Pikaffects: Cake-ify, Crumble, Crush, Decapitate, Deflate, Dissolve, Explode, Eye-pop, Inflate, Levitate, Melt, Peel, Poke, Squish, Ta-da, Tear", category="api node/video/Pika", inputs=[ - comfy_io.Image.Input("image", tooltip="The reference image to apply the Pikaffect to."), - comfy_io.Combo.Input( + IO.Image.Input("image", tooltip="The reference image to apply the Pikaffect to."), + IO.Combo.Input( "pikaffect", options=pika_defs.Pikaffect, default="Cake-ify" ), - comfy_io.String.Input("prompt_text", multiline=True), - comfy_io.String.Input("negative_prompt", multiline=True), - comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), + IO.String.Input("prompt_text", multiline=True), + IO.String.Input("negative_prompt", multiline=True), + IO.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -531,7 +531,7 @@ class PikaffectsNode(comfy_io.ComfyNode): prompt_text: str, negative_prompt: str, seed: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: auth = { "auth_token": cls.hidden.auth_token_comfy_org, "comfy_api_key": cls.hidden.api_key_comfy_org, @@ -556,26 +556,26 @@ class PikaffectsNode(comfy_io.ComfyNode): return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) -class PikaStartEndFrameNode(comfy_io.ComfyNode): +class PikaStartEndFrameNode(IO.ComfyNode): """PikaFrames v2.2 Node.""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="PikaStartEndFrameNode2_2", display_name="Pika Start and End Frame to Video", description="Generate a video by combining your first and last frame. Upload two images to define the start and end points, and let the AI create a smooth transition between them.", category="api node/video/Pika", inputs=[ - comfy_io.Image.Input("image_start", tooltip="The first image to combine."), - comfy_io.Image.Input("image_end", tooltip="The last image to combine."), + IO.Image.Input("image_start", tooltip="The first image to combine."), + IO.Image.Input("image_end", tooltip="The last image to combine."), *get_base_inputs_types(), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -590,7 +590,7 @@ class PikaStartEndFrameNode(comfy_io.ComfyNode): seed: int, resolution: str, duration: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt_text, field_name="prompt_text", min_length=1) pika_files = [ ("keyFrames", ("image_start.png", tensor_to_bytesio(image_start), "image/png")), @@ -623,7 +623,7 @@ class PikaStartEndFrameNode(comfy_io.ComfyNode): class PikaApiNodesExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ PikaImageToVideo, PikaTextToVideoNode, diff --git a/comfy_api_nodes/nodes_pixverse.py b/comfy_api_nodes/nodes_pixverse.py index a97610f06..438a7f80b 100644 --- a/comfy_api_nodes/nodes_pixverse.py +++ b/comfy_api_nodes/nodes_pixverse.py @@ -29,7 +29,7 @@ from comfy_api_nodes.apinode_utils import ( validate_string, ) from comfy_api.input_impl import VideoFromFile -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, IO import torch import aiohttp @@ -73,69 +73,69 @@ async def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None): return response_upload.Resp.img_id -class PixverseTemplateNode(comfy_io.ComfyNode): +class PixverseTemplateNode(IO.ComfyNode): """ Select template for PixVerse Video generation. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="PixverseTemplateNode", display_name="PixVerse Template", category="api node/video/PixVerse", inputs=[ - comfy_io.Combo.Input("template", options=list(pixverse_templates.keys())), + IO.Combo.Input("template", options=list(pixverse_templates.keys())), ], - outputs=[comfy_io.Custom(PixverseIO.TEMPLATE).Output(display_name="pixverse_template")], + outputs=[IO.Custom(PixverseIO.TEMPLATE).Output(display_name="pixverse_template")], ) @classmethod - def execute(cls, template: str) -> comfy_io.NodeOutput: + def execute(cls, template: str) -> IO.NodeOutput: template_id = pixverse_templates.get(template, None) if template_id is None: raise Exception(f"Template '{template}' is not recognized.") # just return the integer - return comfy_io.NodeOutput(template_id) + return IO.NodeOutput(template_id) -class PixverseTextToVideoNode(comfy_io.ComfyNode): +class PixverseTextToVideoNode(IO.ComfyNode): """ Generates videos based on prompt and output_size. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="PixverseTextToVideoNode", display_name="PixVerse Text to Video", category="api node/video/PixVerse", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the video generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=PixverseAspectRatio, ), - comfy_io.Combo.Input( + IO.Combo.Input( "quality", options=PixverseQuality, default=PixverseQuality.res_540p, ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration_seconds", options=PixverseDuration, ), - comfy_io.Combo.Input( + IO.Combo.Input( "motion_mode", options=PixverseMotionMode, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -143,24 +143,24 @@ class PixverseTextToVideoNode(comfy_io.ComfyNode): control_after_generate=True, tooltip="Seed for video generation.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", default="", multiline=True, tooltip="An optional text description of undesired elements on an image.", optional=True, ), - comfy_io.Custom(PixverseIO.TEMPLATE).Input( + IO.Custom(PixverseIO.TEMPLATE).Input( "pixverse_template", tooltip="An optional template to influence style of generation, created by the PixVerse Template node.", optional=True, ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -176,7 +176,7 @@ class PixverseTextToVideoNode(comfy_io.ComfyNode): seed, negative_prompt: str = None, pixverse_template: int = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration @@ -237,43 +237,43 @@ class PixverseTextToVideoNode(comfy_io.ComfyNode): async with aiohttp.ClientSession() as session: async with session.get(response_poll.Resp.url) as vid_response: - return comfy_io.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) + return IO.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) -class PixverseImageToVideoNode(comfy_io.ComfyNode): +class PixverseImageToVideoNode(IO.ComfyNode): """ Generates videos based on prompt and output_size. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="PixverseImageToVideoNode", display_name="PixVerse Image to Video", category="api node/video/PixVerse", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("image"), - comfy_io.String.Input( + IO.Image.Input("image"), + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the video generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "quality", options=PixverseQuality, default=PixverseQuality.res_540p, ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration_seconds", options=PixverseDuration, ), - comfy_io.Combo.Input( + IO.Combo.Input( "motion_mode", options=PixverseMotionMode, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -281,24 +281,24 @@ class PixverseImageToVideoNode(comfy_io.ComfyNode): control_after_generate=True, tooltip="Seed for video generation.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", default="", multiline=True, tooltip="An optional text description of undesired elements on an image.", optional=True, ), - comfy_io.Custom(PixverseIO.TEMPLATE).Input( + IO.Custom(PixverseIO.TEMPLATE).Input( "pixverse_template", tooltip="An optional template to influence style of generation, created by the PixVerse Template node.", optional=True, ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -314,7 +314,7 @@ class PixverseImageToVideoNode(comfy_io.ComfyNode): seed, negative_prompt: str = None, pixverse_template: int = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) auth = { "auth_token": cls.hidden.auth_token_comfy_org, @@ -377,44 +377,44 @@ class PixverseImageToVideoNode(comfy_io.ComfyNode): async with aiohttp.ClientSession() as session: async with session.get(response_poll.Resp.url) as vid_response: - return comfy_io.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) + return IO.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) -class PixverseTransitionVideoNode(comfy_io.ComfyNode): +class PixverseTransitionVideoNode(IO.ComfyNode): """ Generates videos based on prompt and output_size. """ @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="PixverseTransitionVideoNode", display_name="PixVerse Transition Video", category="api node/video/PixVerse", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("first_frame"), - comfy_io.Image.Input("last_frame"), - comfy_io.String.Input( + IO.Image.Input("first_frame"), + IO.Image.Input("last_frame"), + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the video generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "quality", options=PixverseQuality, default=PixverseQuality.res_540p, ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration_seconds", options=PixverseDuration, ), - comfy_io.Combo.Input( + IO.Combo.Input( "motion_mode", options=PixverseMotionMode, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -422,7 +422,7 @@ class PixverseTransitionVideoNode(comfy_io.ComfyNode): control_after_generate=True, tooltip="Seed for video generation.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", default="", multiline=True, @@ -430,11 +430,11 @@ class PixverseTransitionVideoNode(comfy_io.ComfyNode): optional=True, ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -450,7 +450,7 @@ class PixverseTransitionVideoNode(comfy_io.ComfyNode): motion_mode: str, seed, negative_prompt: str = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) auth = { "auth_token": cls.hidden.auth_token_comfy_org, @@ -514,12 +514,12 @@ class PixverseTransitionVideoNode(comfy_io.ComfyNode): async with aiohttp.ClientSession() as session: async with session.get(response_poll.Resp.url) as vid_response: - return comfy_io.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) + return IO.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) class PixVerseExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ PixverseTextToVideoNode, PixverseImageToVideoNode, diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index 0eb762a1c..cf2172bd6 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -32,20 +32,20 @@ from comfy_api_nodes.apis.client import ( SynchronousOperation, PollingOperation, ) -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, IO COMMON_PARAMETERS = [ - comfy_io.Int.Input( + IO.Int.Input( "Seed", default=0, min=0, max=65535, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), - comfy_io.Combo.Input("Material_Type", options=["PBR", "Shaded"], default="PBR", optional=True), - comfy_io.Combo.Input( + IO.Combo.Input("Material_Type", options=["PBR", "Shaded"], default="PBR", optional=True), + IO.Combo.Input( "Polygon_count", options=["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "200K-Triangle"], default="18K-Quad", @@ -259,24 +259,24 @@ async def download_files(url_list, task_uuid): return model_file_path -class Rodin3D_Regular(comfy_io.ComfyNode): +class Rodin3D_Regular(IO.ComfyNode): """Generate 3D Assets using Rodin API""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="Rodin3D_Regular", display_name="Rodin 3D Generate - Regular Generate", category="api node/3d/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("Images"), + IO.Image.Input("Images"), *COMMON_PARAMETERS, ], - outputs=[comfy_io.String.Output(display_name="3D Model Path")], + outputs=[IO.String.Output(display_name="3D Model Path")], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, ], is_api_node=True, ) @@ -288,7 +288,7 @@ class Rodin3D_Regular(comfy_io.ComfyNode): Seed, Material_Type, Polygon_count, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: tier = "Regular" num_images = Images.shape[0] m_images = [] @@ -312,27 +312,27 @@ class Rodin3D_Regular(comfy_io.ComfyNode): download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) model = await download_files(download_list, task_uuid) - return comfy_io.NodeOutput(model) + return IO.NodeOutput(model) -class Rodin3D_Detail(comfy_io.ComfyNode): +class Rodin3D_Detail(IO.ComfyNode): """Generate 3D Assets using Rodin API""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="Rodin3D_Detail", display_name="Rodin 3D Generate - Detail Generate", category="api node/3d/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("Images"), + IO.Image.Input("Images"), *COMMON_PARAMETERS, ], - outputs=[comfy_io.String.Output(display_name="3D Model Path")], + outputs=[IO.String.Output(display_name="3D Model Path")], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, ], is_api_node=True, ) @@ -344,7 +344,7 @@ class Rodin3D_Detail(comfy_io.ComfyNode): Seed, Material_Type, Polygon_count, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: tier = "Detail" num_images = Images.shape[0] m_images = [] @@ -368,27 +368,27 @@ class Rodin3D_Detail(comfy_io.ComfyNode): download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) model = await download_files(download_list, task_uuid) - return comfy_io.NodeOutput(model) + return IO.NodeOutput(model) -class Rodin3D_Smooth(comfy_io.ComfyNode): +class Rodin3D_Smooth(IO.ComfyNode): """Generate 3D Assets using Rodin API""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="Rodin3D_Smooth", display_name="Rodin 3D Generate - Smooth Generate", category="api node/3d/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("Images"), + IO.Image.Input("Images"), *COMMON_PARAMETERS, ], - outputs=[comfy_io.String.Output(display_name="3D Model Path")], + outputs=[IO.String.Output(display_name="3D Model Path")], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, ], is_api_node=True, ) @@ -400,7 +400,7 @@ class Rodin3D_Smooth(comfy_io.ComfyNode): Seed, Material_Type, Polygon_count, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: tier = "Smooth" num_images = Images.shape[0] m_images = [] @@ -424,34 +424,34 @@ class Rodin3D_Smooth(comfy_io.ComfyNode): download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) model = await download_files(download_list, task_uuid) - return comfy_io.NodeOutput(model) + return IO.NodeOutput(model) -class Rodin3D_Sketch(comfy_io.ComfyNode): +class Rodin3D_Sketch(IO.ComfyNode): """Generate 3D Assets using Rodin API""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="Rodin3D_Sketch", display_name="Rodin 3D Generate - Sketch Generate", category="api node/3d/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("Images"), - comfy_io.Int.Input( + IO.Image.Input("Images"), + IO.Int.Input( "Seed", default=0, min=0, max=65535, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), ], - outputs=[comfy_io.String.Output(display_name="3D Model Path")], + outputs=[IO.String.Output(display_name="3D Model Path")], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, ], is_api_node=True, ) @@ -461,7 +461,7 @@ class Rodin3D_Sketch(comfy_io.ComfyNode): cls, Images, Seed, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: tier = "Sketch" num_images = Images.shape[0] m_images = [] @@ -487,42 +487,42 @@ class Rodin3D_Sketch(comfy_io.ComfyNode): download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) model = await download_files(download_list, task_uuid) - return comfy_io.NodeOutput(model) + return IO.NodeOutput(model) -class Rodin3D_Gen2(comfy_io.ComfyNode): +class Rodin3D_Gen2(IO.ComfyNode): """Generate 3D Assets using Rodin API""" @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="Rodin3D_Gen2", display_name="Rodin 3D Generate - Gen-2 Generate", category="api node/3d/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("Images"), - comfy_io.Int.Input( + IO.Image.Input("Images"), + IO.Int.Input( "Seed", default=0, min=0, max=65535, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), - comfy_io.Combo.Input("Material_Type", options=["PBR", "Shaded"], default="PBR", optional=True), - comfy_io.Combo.Input( + IO.Combo.Input("Material_Type", options=["PBR", "Shaded"], default="PBR", optional=True), + IO.Combo.Input( "Polygon_count", options=["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "2K-Triangle", "20K-Triangle", "150K-Triangle", "500K-Triangle"], default="500K-Triangle", optional=True, ), - comfy_io.Boolean.Input("TAPose", default=False), + IO.Boolean.Input("TAPose", default=False), ], - outputs=[comfy_io.String.Output(display_name="3D Model Path")], + outputs=[IO.String.Output(display_name="3D Model Path")], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, ], is_api_node=True, ) @@ -535,7 +535,7 @@ class Rodin3D_Gen2(comfy_io.ComfyNode): Material_Type, Polygon_count, TAPose, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: tier = "Gen-2" num_images = Images.shape[0] m_images = [] @@ -560,12 +560,12 @@ class Rodin3D_Gen2(comfy_io.ComfyNode): download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) model = await download_files(download_list, task_uuid) - return comfy_io.NodeOutput(model) + return IO.NodeOutput(model) class Rodin3DExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ Rodin3D_Regular, Rodin3D_Detail, diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py index ea22692cb..eb03a897d 100644 --- a/comfy_api_nodes/nodes_runway.py +++ b/comfy_api_nodes/nodes_runway.py @@ -48,7 +48,7 @@ from comfy_api_nodes.apinode_utils import ( download_url_to_image_tensor, ) from comfy_api.input_impl import VideoFromFile -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, IO from comfy_api_nodes.util.validation_utils import validate_image_dimensions, validate_image_aspect_ratio PATH_IMAGE_TO_VIDEO = "/proxy/runway/image_to_video" @@ -175,11 +175,11 @@ async def generate_video( return await download_url_to_video_output(video_url) -class RunwayImageToVideoNodeGen3a(comfy_io.ComfyNode): +class RunwayImageToVideoNodeGen3a(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="RunwayImageToVideoNodeGen3a", display_name="Runway Image to Video (Gen3a Turbo)", category="api node/video/Runway", @@ -188,42 +188,42 @@ class RunwayImageToVideoNodeGen3a(comfy_io.ComfyNode): "your input selections will set your generation up for success: " "https://help.runwayml.com/hc/en-us/articles/33927968552339-Creating-with-Act-One-on-Gen-3-Alpha-and-Turbo.", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text prompt for the generation", ), - comfy_io.Image.Input( + IO.Image.Input( "start_frame", tooltip="Start frame to be used for the video", ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", options=Duration, ), - comfy_io.Combo.Input( + IO.Combo.Input( "ratio", options=RunwayGen3aAspectRatio, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967295, step=1, control_after_generate=True, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Random seed for generation", ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -236,7 +236,7 @@ class RunwayImageToVideoNodeGen3a(comfy_io.ComfyNode): duration: str, ratio: str, seed: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, min_length=1) validate_image_dimensions(start_frame, max_width=7999, max_height=7999) validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) @@ -253,7 +253,7 @@ class RunwayImageToVideoNodeGen3a(comfy_io.ComfyNode): auth_kwargs=auth_kwargs, ) - return comfy_io.NodeOutput( + return IO.NodeOutput( await generate_video( RunwayImageToVideoRequest( promptText=prompt, @@ -275,11 +275,11 @@ class RunwayImageToVideoNodeGen3a(comfy_io.ComfyNode): ) -class RunwayImageToVideoNodeGen4(comfy_io.ComfyNode): +class RunwayImageToVideoNodeGen4(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="RunwayImageToVideoNodeGen4", display_name="Runway Image to Video (Gen4 Turbo)", category="api node/video/Runway", @@ -288,42 +288,42 @@ class RunwayImageToVideoNodeGen4(comfy_io.ComfyNode): "your input selections will set your generation up for success: " "https://help.runwayml.com/hc/en-us/articles/37327109429011-Creating-with-Gen-4-Video.", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text prompt for the generation", ), - comfy_io.Image.Input( + IO.Image.Input( "start_frame", tooltip="Start frame to be used for the video", ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", options=Duration, ), - comfy_io.Combo.Input( + IO.Combo.Input( "ratio", options=RunwayGen4TurboAspectRatio, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967295, step=1, control_after_generate=True, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Random seed for generation", ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -336,7 +336,7 @@ class RunwayImageToVideoNodeGen4(comfy_io.ComfyNode): duration: str, ratio: str, seed: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, min_length=1) validate_image_dimensions(start_frame, max_width=7999, max_height=7999) validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) @@ -353,7 +353,7 @@ class RunwayImageToVideoNodeGen4(comfy_io.ComfyNode): auth_kwargs=auth_kwargs, ) - return comfy_io.NodeOutput( + return IO.NodeOutput( await generate_video( RunwayImageToVideoRequest( promptText=prompt, @@ -376,11 +376,11 @@ class RunwayImageToVideoNodeGen4(comfy_io.ComfyNode): ) -class RunwayFirstLastFrameNode(comfy_io.ComfyNode): +class RunwayFirstLastFrameNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="RunwayFirstLastFrameNode", display_name="Runway First-Last-Frame to Video", category="api node/video/Runway", @@ -392,46 +392,46 @@ class RunwayFirstLastFrameNode(comfy_io.ComfyNode): "will set your generation up for success: " "https://help.runwayml.com/hc/en-us/articles/34170748696595-Creating-with-Keyframes-on-Gen-3.", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text prompt for the generation", ), - comfy_io.Image.Input( + IO.Image.Input( "start_frame", tooltip="Start frame to be used for the video", ), - comfy_io.Image.Input( + IO.Image.Input( "end_frame", tooltip="End frame to be used for the video. Supported for gen3a_turbo only.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", options=Duration, ), - comfy_io.Combo.Input( + IO.Combo.Input( "ratio", options=RunwayGen3aAspectRatio, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967295, step=1, control_after_generate=True, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Random seed for generation", ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -445,7 +445,7 @@ class RunwayFirstLastFrameNode(comfy_io.ComfyNode): duration: str, ratio: str, seed: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, min_length=1) validate_image_dimensions(start_frame, max_width=7999, max_height=7999) validate_image_dimensions(end_frame, max_width=7999, max_height=7999) @@ -467,7 +467,7 @@ class RunwayFirstLastFrameNode(comfy_io.ComfyNode): if len(download_urls) != 2: raise RunwayApiError("Failed to upload one or more images to comfy api.") - return comfy_io.NodeOutput( + return IO.NodeOutput( await generate_video( RunwayImageToVideoRequest( promptText=prompt, @@ -493,40 +493,40 @@ class RunwayFirstLastFrameNode(comfy_io.ComfyNode): ) -class RunwayTextToImageNode(comfy_io.ComfyNode): +class RunwayTextToImageNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="RunwayTextToImageNode", display_name="Runway Text to Image", category="api node/image/Runway", description="Generate an image from a text prompt using Runway's Gen 4 model. " "You can also include reference image to guide the generation.", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text prompt for the generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "ratio", options=[model.value for model in RunwayTextToImageAspectRatioEnum], ), - comfy_io.Image.Input( + IO.Image.Input( "reference_image", tooltip="Optional reference image to guide the generation", optional=True, ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -537,7 +537,7 @@ class RunwayTextToImageNode(comfy_io.ComfyNode): prompt: str, ratio: str, reference_image: Optional[torch.Tensor] = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, min_length=1) auth_kwargs = { @@ -588,12 +588,12 @@ class RunwayTextToImageNode(comfy_io.ComfyNode): if not final_response.output: raise RunwayApiError("Runway task succeeded but no image data found in response.") - return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_task_status(final_response))) + return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_task_status(final_response))) class RunwayExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ RunwayFirstLastFrameNode, RunwayImageToVideoNodeGen3a, diff --git a/comfy_api_nodes/nodes_sora.py b/comfy_api_nodes/nodes_sora.py index 2d532d637..efc954869 100644 --- a/comfy_api_nodes/nodes_sora.py +++ b/comfy_api_nodes/nodes_sora.py @@ -3,7 +3,7 @@ from typing_extensions import override import torch from pydantic import BaseModel, Field -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, IO from comfy_api_nodes.apis.client import ( ApiEndpoint, HttpMethod, @@ -31,27 +31,27 @@ class Sora2GenerationResponse(BaseModel): status: Optional[str] = Field(None) -class OpenAIVideoSora2(comfy_io.ComfyNode): +class OpenAIVideoSora2(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="OpenAIVideoSora2", display_name="OpenAI Sora - Video", category="api node/video/Sora", description="OpenAI video and audio generation.", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["sora-2", "sora-2-pro"], default="sora-2", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Guiding text; may be empty if an input image is present.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "size", options=[ "720x1280", @@ -61,22 +61,22 @@ class OpenAIVideoSora2(comfy_io.ComfyNode): ], default="1280x720", ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", options=[4, 8, 12], default=8, ), - comfy_io.Image.Input( + IO.Image.Input( "image", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, optional=True, tooltip="Seed to determine if node should re-run; " @@ -84,12 +84,12 @@ class OpenAIVideoSora2(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -155,7 +155,7 @@ class OpenAIVideoSora2(comfy_io.ComfyNode): estimated_duration=45 * (duration / 4) * model_time_multiplier, ) await poll_operation.execute() - return comfy_io.NodeOutput( + return IO.NodeOutput( await download_url_to_video_output( f"/proxy/openai/v1/videos/{initial_response.id}/content", auth_kwargs=auth, @@ -165,7 +165,7 @@ class OpenAIVideoSora2(comfy_io.ComfyNode): class OpenAISoraExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ OpenAIVideoSora2, ] diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index bfb67fc9d..8af03cfd1 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -2,7 +2,7 @@ from inspect import cleandoc from typing import Optional from typing_extensions import override -from comfy_api.latest import ComfyExtension, Input, io as comfy_io +from comfy_api.latest import ComfyExtension, Input, IO from comfy_api_nodes.apis.stability_api import ( StabilityUpscaleConservativeRequest, StabilityUpscaleCreativeRequest, @@ -56,20 +56,20 @@ def get_async_dummy_status(x: StabilityResultsGetResponse): return StabilityPollStatus.in_progress -class StabilityStableImageUltraNode(comfy_io.ComfyNode): +class StabilityStableImageUltraNode(IO.ComfyNode): """ Generates images synchronously based on prompt and resolution. """ @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityStableImageUltraNode", display_name="Stability AI Stable Image Ultra", category="api node/image/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", @@ -80,39 +80,39 @@ class StabilityStableImageUltraNode(comfy_io.ComfyNode): "is a value between 0 and 1. For example: `The sky was a crisp (blue:0.3) and (green:0.8)`" + "would convey a sky that was blue and green, but more green than blue.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=StabilityAspectRatio, default=StabilityAspectRatio.ratio_1_1, tooltip="Aspect ratio of generated image.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "style_preset", options=get_stability_style_presets(), tooltip="Optional desired style of generated image.", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for creating the noise.", ), - comfy_io.Image.Input( + IO.Image.Input( "image", optional=True, ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", default="", tooltip="A blurb of text describing what you do not wish to see in the output image. This is an advanced feature.", force_input=True, optional=True, ), - comfy_io.Float.Input( + IO.Float.Input( "image_denoise", default=0.5, min=0.0, @@ -123,12 +123,12 @@ class StabilityStableImageUltraNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -143,7 +143,7 @@ class StabilityStableImageUltraNode(comfy_io.ComfyNode): image: Optional[torch.Tensor] = None, negative_prompt: str = "", image_denoise: Optional[float] = 0.5, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) # prepare image binary if image present image_binary = None @@ -193,44 +193,44 @@ class StabilityStableImageUltraNode(comfy_io.ComfyNode): image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return comfy_io.NodeOutput(returned_image) + return IO.NodeOutput(returned_image) -class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): +class StabilityStableImageSD_3_5Node(IO.ComfyNode): """ Generates images synchronously based on prompt and resolution. """ @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityStableImageSD_3_5Node", display_name="Stability AI Stable Diffusion 3.5 Image", category="api node/image/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=Stability_SD3_5_Model, ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=StabilityAspectRatio, default=StabilityAspectRatio.ratio_1_1, tooltip="Aspect ratio of generated image.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "style_preset", options=get_stability_style_presets(), tooltip="Optional desired style of generated image.", ), - comfy_io.Float.Input( + IO.Float.Input( "cfg_scale", default=4.0, min=1.0, @@ -238,28 +238,28 @@ class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): step=0.1, tooltip="How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt)", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for creating the noise.", ), - comfy_io.Image.Input( + IO.Image.Input( "image", optional=True, ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", default="", tooltip="Keywords of what you do not wish to see in the output image. This is an advanced feature.", force_input=True, optional=True, ), - comfy_io.Float.Input( + IO.Float.Input( "image_denoise", default=0.5, min=0.0, @@ -270,12 +270,12 @@ class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -292,7 +292,7 @@ class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): image: Optional[torch.Tensor] = None, negative_prompt: str = "", image_denoise: Optional[float] = 0.5, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) # prepare image binary if image present image_binary = None @@ -348,30 +348,30 @@ class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return comfy_io.NodeOutput(returned_image) + return IO.NodeOutput(returned_image) -class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): +class StabilityUpscaleConservativeNode(IO.ComfyNode): """ Upscale image with minimal alterations to 4K resolution. """ @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityUpscaleConservativeNode", display_name="Stability AI Upscale Conservative", category="api node/image/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("image"), - comfy_io.String.Input( + IO.Image.Input("image"), + IO.String.Input( "prompt", multiline=True, default="", tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results.", ), - comfy_io.Float.Input( + IO.Float.Input( "creativity", default=0.35, min=0.2, @@ -379,17 +379,17 @@ class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): step=0.01, tooltip="Controls the likelihood of creating additional details not heavily conditioned by the init image.", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for creating the noise.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", default="", tooltip="Keywords of what you do not wish to see in the output image. This is an advanced feature.", @@ -398,12 +398,12 @@ class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -416,7 +416,7 @@ class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): creativity: float, seed: int, negative_prompt: str = "", - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read() @@ -457,30 +457,30 @@ class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return comfy_io.NodeOutput(returned_image) + return IO.NodeOutput(returned_image) -class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): +class StabilityUpscaleCreativeNode(IO.ComfyNode): """ Upscale image with minimal alterations to 4K resolution. """ @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityUpscaleCreativeNode", display_name="Stability AI Upscale Creative", category="api node/image/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("image"), - comfy_io.String.Input( + IO.Image.Input("image"), + IO.String.Input( "prompt", multiline=True, default="", tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results.", ), - comfy_io.Float.Input( + IO.Float.Input( "creativity", default=0.3, min=0.1, @@ -488,22 +488,22 @@ class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): step=0.01, tooltip="Controls the likelihood of creating additional details not heavily conditioned by the init image.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "style_preset", options=get_stability_style_presets(), tooltip="Optional desired style of generated image.", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for creating the noise.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", default="", tooltip="Keywords of what you do not wish to see in the output image. This is an advanced feature.", @@ -512,12 +512,12 @@ class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -531,7 +531,7 @@ class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): style_preset: str, seed: int, negative_prompt: str = "", - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read() @@ -591,37 +591,37 @@ class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): image_data = base64.b64decode(response_poll.result) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return comfy_io.NodeOutput(returned_image) + return IO.NodeOutput(returned_image) -class StabilityUpscaleFastNode(comfy_io.ComfyNode): +class StabilityUpscaleFastNode(IO.ComfyNode): """ Quickly upscales an image via Stability API call to 4x its original size; intended for upscaling low-quality/compressed images. """ @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityUpscaleFastNode", display_name="Stability AI Upscale Fast", category="api node/image/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("image"), + IO.Image.Input("image"), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @classmethod - async def execute(cls, image: torch.Tensor) -> comfy_io.NodeOutput: + async def execute(cls, image: torch.Tensor) -> IO.NodeOutput: image_binary = tensor_to_bytesio(image, total_pixels=4096*4096).read() files = { @@ -653,26 +653,26 @@ class StabilityUpscaleFastNode(comfy_io.ComfyNode): image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return comfy_io.NodeOutput(returned_image) + return IO.NodeOutput(returned_image) -class StabilityTextToAudio(comfy_io.ComfyNode): +class StabilityTextToAudio(IO.ComfyNode): """Generates high-quality music and sound effects from text descriptions.""" @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityTextToAudio", display_name="Stability AI Text To Audio", category="api node/audio/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["stable-audio-2.5"], ), - comfy_io.String.Input("prompt", multiline=True, default=""), - comfy_io.Int.Input( + IO.String.Input("prompt", multiline=True, default=""), + IO.Int.Input( "duration", default=190, min=1, @@ -681,18 +681,18 @@ class StabilityTextToAudio(comfy_io.ComfyNode): tooltip="Controls the duration in seconds of the generated audio.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for generation.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=8, min=4, @@ -703,18 +703,18 @@ class StabilityTextToAudio(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Audio.Output(), + IO.Audio.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @classmethod - async def execute(cls, model: str, prompt: str, duration: int, seed: int, steps: int) -> comfy_io.NodeOutput: + async def execute(cls, model: str, prompt: str, duration: int, seed: int, steps: int) -> IO.NodeOutput: validate_string(prompt, max_length=10000) payload = StabilityTextToAudioRequest(prompt=prompt, model=model, duration=duration, seed=seed, steps=steps) operation = SynchronousOperation( @@ -734,27 +734,27 @@ class StabilityTextToAudio(comfy_io.ComfyNode): response_api = await operation.execute() if not response_api.audio: raise ValueError("No audio file was received in response.") - return comfy_io.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) + return IO.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) -class StabilityAudioToAudio(comfy_io.ComfyNode): +class StabilityAudioToAudio(IO.ComfyNode): """Transforms existing audio samples into new high-quality compositions using text instructions.""" @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityAudioToAudio", display_name="Stability AI Audio To Audio", category="api node/audio/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["stable-audio-2.5"], ), - comfy_io.String.Input("prompt", multiline=True, default=""), - comfy_io.Audio.Input("audio", tooltip="Audio must be between 6 and 190 seconds long."), - comfy_io.Int.Input( + IO.String.Input("prompt", multiline=True, default=""), + IO.Audio.Input("audio", tooltip="Audio must be between 6 and 190 seconds long."), + IO.Int.Input( "duration", default=190, min=1, @@ -763,18 +763,18 @@ class StabilityAudioToAudio(comfy_io.ComfyNode): tooltip="Controls the duration in seconds of the generated audio.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for generation.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=8, min=4, @@ -783,24 +783,24 @@ class StabilityAudioToAudio(comfy_io.ComfyNode): tooltip="Controls the number of sampling steps.", optional=True, ), - comfy_io.Float.Input( + IO.Float.Input( "strength", default=1, min=0.01, max=1.0, step=0.01, - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, tooltip="Parameter controls how much influence the audio parameter has on the generated audio.", optional=True, ), ], outputs=[ - comfy_io.Audio.Output(), + IO.Audio.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -808,7 +808,7 @@ class StabilityAudioToAudio(comfy_io.ComfyNode): @classmethod async def execute( cls, model: str, prompt: str, audio: Input.Audio, duration: int, seed: int, steps: int, strength: float - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, max_length=10000) validate_audio_duration(audio, 6, 190) payload = StabilityAudioToAudioRequest( @@ -832,27 +832,27 @@ class StabilityAudioToAudio(comfy_io.ComfyNode): response_api = await operation.execute() if not response_api.audio: raise ValueError("No audio file was received in response.") - return comfy_io.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) + return IO.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) -class StabilityAudioInpaint(comfy_io.ComfyNode): +class StabilityAudioInpaint(IO.ComfyNode): """Transforms part of existing audio sample using text instructions.""" @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityAudioInpaint", display_name="Stability AI Audio Inpaint", category="api node/audio/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["stable-audio-2.5"], ), - comfy_io.String.Input("prompt", multiline=True, default=""), - comfy_io.Audio.Input("audio", tooltip="Audio must be between 6 and 190 seconds long."), - comfy_io.Int.Input( + IO.String.Input("prompt", multiline=True, default=""), + IO.Audio.Input("audio", tooltip="Audio must be between 6 and 190 seconds long."), + IO.Int.Input( "duration", default=190, min=1, @@ -861,18 +861,18 @@ class StabilityAudioInpaint(comfy_io.ComfyNode): tooltip="Controls the duration in seconds of the generated audio.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for generation.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=8, min=4, @@ -881,7 +881,7 @@ class StabilityAudioInpaint(comfy_io.ComfyNode): tooltip="Controls the number of sampling steps.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "mask_start", default=30, min=0, @@ -889,7 +889,7 @@ class StabilityAudioInpaint(comfy_io.ComfyNode): step=1, optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "mask_end", default=190, min=0, @@ -899,12 +899,12 @@ class StabilityAudioInpaint(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Audio.Output(), + IO.Audio.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -920,7 +920,7 @@ class StabilityAudioInpaint(comfy_io.ComfyNode): steps: int, mask_start: int, mask_end: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, max_length=10000) if mask_end <= mask_start: raise ValueError(f"Value of mask_end({mask_end}) should be greater then mask_start({mask_start})") @@ -953,12 +953,12 @@ class StabilityAudioInpaint(comfy_io.ComfyNode): response_api = await operation.execute() if not response_api.audio: raise ValueError("No audio file was received in response.") - return comfy_io.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) + return IO.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) class StabilityExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ StabilityStableImageUltraNode, StabilityStableImageSD_3_5Node, diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index 9d5eced1e..4588a7991 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -6,7 +6,7 @@ from io import BytesIO from typing import Optional from typing_extensions import override -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, IO from comfy_api.input_impl.video_types import VideoFromFile from comfy_api_nodes.apis import ( VeoGenVidRequest, @@ -51,7 +51,7 @@ def get_video_url_from_response(poll_response: VeoGenVidPollResponse) -> Optiona return None -class VeoVideoGenerationNode(comfy_io.ComfyNode): +class VeoVideoGenerationNode(IO.ComfyNode): """ Generates videos from text prompts using Google's Veo API. @@ -61,71 +61,71 @@ class VeoVideoGenerationNode(comfy_io.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="VeoVideoGenerationNode", display_name="Google Veo 2 Video Generation", category="api node/video/Veo", description="Generates videos from text prompts using Google's Veo 2 API", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text description of the video", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=["16:9", "9:16"], default="16:9", tooltip="Aspect ratio of the output video", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Negative text prompt to guide what to avoid in the video", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "duration_seconds", default=5, min=5, max=8, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Duration of the output video in seconds", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "enhance_prompt", default=True, tooltip="Whether to enhance the prompt with AI assistance", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "person_generation", options=["ALLOW", "BLOCK"], default="ALLOW", tooltip="Whether to allow generating people in the video", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=0xFFFFFFFF, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed for video generation (0 for random)", optional=True, ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="Optional reference image to guide video generation", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["veo-2.0-generate-001"], default="veo-2.0-generate-001", @@ -134,12 +134,12 @@ class VeoVideoGenerationNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -302,7 +302,7 @@ class VeoVideoGenerationNode(comfy_io.ComfyNode): video_io = BytesIO(video_data) # Return VideoFromFile object - return comfy_io.NodeOutput(VideoFromFile(video_io)) + return IO.NodeOutput(VideoFromFile(video_io)) class Veo3VideoGenerationNode(VeoVideoGenerationNode): @@ -319,78 +319,78 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="Veo3VideoGenerationNode", display_name="Google Veo 3 Video Generation", category="api node/video/Veo", description="Generates videos from text prompts using Google's Veo 3 API", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text description of the video", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=["16:9", "9:16"], default="16:9", tooltip="Aspect ratio of the output video", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Negative text prompt to guide what to avoid in the video", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "duration_seconds", default=8, min=8, max=8, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Duration of the output video in seconds (Veo 3 only supports 8 seconds)", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "enhance_prompt", default=True, tooltip="Whether to enhance the prompt with AI assistance", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "person_generation", options=["ALLOW", "BLOCK"], default="ALLOW", tooltip="Whether to allow generating people in the video", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=0xFFFFFFFF, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed for video generation (0 for random)", optional=True, ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="Optional reference image to guide video generation", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["veo-3.0-generate-001", "veo-3.0-fast-generate-001"], default="veo-3.0-generate-001", tooltip="Veo 3 model to use for video generation", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "generate_audio", default=False, tooltip="Generate audio for the video. Supported by all Veo 3 models.", @@ -398,12 +398,12 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -411,7 +411,7 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): class VeoExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ VeoVideoGenerationNode, Veo3VideoGenerationNode, diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py index ac28b683c..639be4b2b 100644 --- a/comfy_api_nodes/nodes_vidu.py +++ b/comfy_api_nodes/nodes_vidu.py @@ -6,7 +6,7 @@ from typing_extensions import override import torch from pydantic import BaseModel, Field -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import ComfyExtension, IO from comfy_api_nodes.util.validation_utils import ( validate_aspect_ratio_closeness, validate_image_dimensions, @@ -161,63 +161,63 @@ async def execute_task( ) -class ViduTextToVideoNode(comfy_io.ComfyNode): +class ViduTextToVideoNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ViduTextToVideoNode", display_name="Vidu Text To Video Generation", category="api node/video/Vidu", description="Generate video from text prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=VideoModelName, default=VideoModelName.vidu_q1, tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="A textual description for video generation", ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=5, max=5, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Duration of the output video in seconds", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed for video generation (0 for random)", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=AspectRatio, default=AspectRatio.r_16_9, tooltip="The aspect ratio of the output video", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=Resolution, default=Resolution.r_1080p, tooltip="Supported values may vary by model & duration", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "movement_amplitude", options=MovementAmplitude, default=MovementAmplitude.auto, @@ -226,12 +226,12 @@ class ViduTextToVideoNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -246,7 +246,7 @@ class ViduTextToVideoNode(comfy_io.ComfyNode): aspect_ratio: str, resolution: str, movement_amplitude: str, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: if not prompt: raise ValueError("The prompt field is required and cannot be empty.") payload = TaskCreationRequest( @@ -263,65 +263,65 @@ class ViduTextToVideoNode(comfy_io.ComfyNode): "comfy_api_key": cls.hidden.api_key_comfy_org, } results = await execute_task(VIDU_TEXT_TO_VIDEO, auth, payload, 320, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + return IO.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) -class ViduImageToVideoNode(comfy_io.ComfyNode): +class ViduImageToVideoNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ViduImageToVideoNode", display_name="Vidu Image To Video Generation", category="api node/video/Vidu", description="Generate video from image and optional prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=VideoModelName, default=VideoModelName.vidu_q1, tooltip="Model name", ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="An image to be used as the start frame of the generated video", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="A textual description for video generation", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=5, max=5, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Duration of the output video in seconds", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed for video generation (0 for random)", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=Resolution, default=Resolution.r_1080p, tooltip="Supported values may vary by model & duration", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "movement_amplitude", options=MovementAmplitude, default=MovementAmplitude.auto.value, @@ -330,12 +330,12 @@ class ViduImageToVideoNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -350,7 +350,7 @@ class ViduImageToVideoNode(comfy_io.ComfyNode): seed: int, resolution: str, movement_amplitude: str, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: if get_number_of_images(image) > 1: raise ValueError("Only one input image is allowed.") validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) @@ -373,70 +373,70 @@ class ViduImageToVideoNode(comfy_io.ComfyNode): auth_kwargs=auth, ) results = await execute_task(VIDU_IMAGE_TO_VIDEO, auth, payload, 120, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + return IO.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) -class ViduReferenceVideoNode(comfy_io.ComfyNode): +class ViduReferenceVideoNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ViduReferenceVideoNode", display_name="Vidu Reference To Video Generation", category="api node/video/Vidu", description="Generate video from multiple images and prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=VideoModelName, default=VideoModelName.vidu_q1, tooltip="Model name", ), - comfy_io.Image.Input( + IO.Image.Input( "images", tooltip="Images to use as references to generate a video with consistent subjects (max 7 images).", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="A textual description for video generation", ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=5, max=5, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Duration of the output video in seconds", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed for video generation (0 for random)", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=AspectRatio, default=AspectRatio.r_16_9, tooltip="The aspect ratio of the output video", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=[model.value for model in Resolution], default=Resolution.r_1080p.value, tooltip="Supported values may vary by model & duration", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "movement_amplitude", options=[model.value for model in MovementAmplitude], default=MovementAmplitude.auto.value, @@ -445,12 +445,12 @@ class ViduReferenceVideoNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -466,7 +466,7 @@ class ViduReferenceVideoNode(comfy_io.ComfyNode): aspect_ratio: str, resolution: str, movement_amplitude: str, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: if not prompt: raise ValueError("The prompt field is required and cannot be empty.") a = get_number_of_images(images) @@ -495,68 +495,68 @@ class ViduReferenceVideoNode(comfy_io.ComfyNode): auth_kwargs=auth, ) results = await execute_task(VIDU_REFERENCE_VIDEO, auth, payload, 120, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + return IO.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) -class ViduStartEndToVideoNode(comfy_io.ComfyNode): +class ViduStartEndToVideoNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ViduStartEndToVideoNode", display_name="Vidu Start End To Video Generation", category="api node/video/Vidu", description="Generate a video from start and end frames and a prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=[model.value for model in VideoModelName], default=VideoModelName.vidu_q1.value, tooltip="Model name", ), - comfy_io.Image.Input( + IO.Image.Input( "first_frame", tooltip="Start frame", ), - comfy_io.Image.Input( + IO.Image.Input( "end_frame", tooltip="End frame", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="A textual description for video generation", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=5, max=5, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Duration of the output video in seconds", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed for video generation (0 for random)", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=[model.value for model in Resolution], default=Resolution.r_1080p.value, tooltip="Supported values may vary by model & duration", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "movement_amplitude", options=[model.value for model in MovementAmplitude], default=MovementAmplitude.auto.value, @@ -565,12 +565,12 @@ class ViduStartEndToVideoNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -586,7 +586,7 @@ class ViduStartEndToVideoNode(comfy_io.ComfyNode): seed: int, resolution: str, movement_amplitude: str, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_aspect_ratio_closeness(first_frame, end_frame, min_rel=0.8, max_rel=1.25, strict=False) payload = TaskCreationRequest( model_name=model, @@ -605,12 +605,12 @@ class ViduStartEndToVideoNode(comfy_io.ComfyNode): for frame in (first_frame, end_frame) ] results = await execute_task(VIDU_START_END_VIDEO, auth, payload, 96, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + return IO.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) class ViduExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ ViduTextToVideoNode, ViduImageToVideoNode, diff --git a/comfy_api_nodes/nodes_wan.py b/comfy_api_nodes/nodes_wan.py index 0be5daadb..b089bd907 100644 --- a/comfy_api_nodes/nodes_wan.py +++ b/comfy_api_nodes/nodes_wan.py @@ -4,7 +4,7 @@ from typing_extensions import override import torch from pydantic import BaseModel, Field -from comfy_api.latest import ComfyExtension, Input, io as comfy_io +from comfy_api.latest import ComfyExtension, Input, IO from comfy_api_nodes.apis.client import ( ApiEndpoint, HttpMethod, @@ -195,35 +195,35 @@ async def process_task( ).execute() -class WanTextToImageApi(comfy_io.ComfyNode): +class WanTextToImageApi(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="WanTextToImageApi", display_name="Wan Text to Image", category="api node/image/Wan", description="Generates image based on text prompt.", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["wan2.5-t2i-preview"], default="wan2.5-t2i-preview", tooltip="Model to use.", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Negative text prompt to guide what to avoid.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "width", default=1024, min=768, @@ -231,7 +231,7 @@ class WanTextToImageApi(comfy_io.ComfyNode): step=32, optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "height", default=1024, min=768, @@ -239,24 +239,24 @@ class WanTextToImageApi(comfy_io.ComfyNode): step=32, optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_extend", default=True, tooltip="Whether to enhance the prompt with AI assistance.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, tooltip="Whether to add an \"AI generated\" watermark to the result.", @@ -264,12 +264,12 @@ class WanTextToImageApi(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -309,36 +309,36 @@ class WanTextToImageApi(comfy_io.ComfyNode): estimated_duration=9, poll_interval=3, ) - return comfy_io.NodeOutput(await download_url_to_image_tensor(str(response.output.results[0].url))) + return IO.NodeOutput(await download_url_to_image_tensor(str(response.output.results[0].url))) -class WanImageToImageApi(comfy_io.ComfyNode): +class WanImageToImageApi(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="WanImageToImageApi", display_name="Wan Image to Image", category="api node/image/Wan", description="Generates an image from one or two input images and a text prompt. " "The output image is currently fixed at 1.6 MP; its aspect ratio matches the input image(s).", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["wan2.5-i2i-preview"], default="wan2.5-i2i-preview", tooltip="Model to use.", ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="Single-image editing or multi-image fusion, maximum 2 images.", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", @@ -346,7 +346,7 @@ class WanImageToImageApi(comfy_io.ComfyNode): optional=True, ), # redo this later as an optional combo of recommended resolutions - # comfy_io.Int.Input( + # IO.Int.Input( # "width", # default=1280, # min=384, @@ -354,7 +354,7 @@ class WanImageToImageApi(comfy_io.ComfyNode): # step=16, # optional=True, # ), - # comfy_io.Int.Input( + # IO.Int.Input( # "height", # default=1280, # min=384, @@ -362,18 +362,18 @@ class WanImageToImageApi(comfy_io.ComfyNode): # step=16, # optional=True, # ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, tooltip="Whether to add an \"AI generated\" watermark to the result.", @@ -381,12 +381,12 @@ class WanImageToImageApi(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -431,38 +431,38 @@ class WanImageToImageApi(comfy_io.ComfyNode): estimated_duration=42, poll_interval=3, ) - return comfy_io.NodeOutput(await download_url_to_image_tensor(str(response.output.results[0].url))) + return IO.NodeOutput(await download_url_to_image_tensor(str(response.output.results[0].url))) -class WanTextToVideoApi(comfy_io.ComfyNode): +class WanTextToVideoApi(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="WanTextToVideoApi", display_name="Wan Text to Video", category="api node/video/Wan", description="Generates video based on text prompt.", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["wan2.5-t2v-preview"], default="wan2.5-t2v-preview", tooltip="Model to use.", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Negative text prompt to guide what to avoid.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "size", options=[ "480p: 1:1 (624x624)", @@ -482,45 +482,45 @@ class WanTextToVideoApi(comfy_io.ComfyNode): default="480p: 1:1 (624x624)", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=5, max=10, step=5, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Available durations: 5 and 10 seconds", optional=True, ), - comfy_io.Audio.Input( + IO.Audio.Input( "audio", optional=True, tooltip="Audio must contain a clear, loud voice, without extraneous noise, background music.", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "generate_audio", default=False, optional=True, tooltip="If there is no audio input, generate audio automatically.", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_extend", default=True, tooltip="Whether to enhance the prompt with AI assistance.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, tooltip="Whether to add an \"AI generated\" watermark to the result.", @@ -528,12 +528,12 @@ class WanTextToVideoApi(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -582,41 +582,41 @@ class WanTextToVideoApi(comfy_io.ComfyNode): estimated_duration=120 * int(duration / 5), poll_interval=6, ) - return comfy_io.NodeOutput(await download_url_to_video_output(response.output.video_url)) + return IO.NodeOutput(await download_url_to_video_output(response.output.video_url)) -class WanImageToVideoApi(comfy_io.ComfyNode): +class WanImageToVideoApi(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="WanImageToVideoApi", display_name="Wan Image to Video", category="api node/video/Wan", description="Generates video based on the first frame and text prompt.", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["wan2.5-i2v-preview"], default="wan2.5-i2v-preview", tooltip="Model to use.", ), - comfy_io.Image.Input( + IO.Image.Input( "image", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Negative text prompt to guide what to avoid.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=[ "480P", @@ -626,45 +626,45 @@ class WanImageToVideoApi(comfy_io.ComfyNode): default="480P", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=5, max=10, step=5, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Available durations: 5 and 10 seconds", optional=True, ), - comfy_io.Audio.Input( + IO.Audio.Input( "audio", optional=True, tooltip="Audio must contain a clear, loud voice, without extraneous noise, background music.", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "generate_audio", default=False, optional=True, tooltip="If there is no audio input, generate audio automatically.", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_extend", default=True, tooltip="Whether to enhance the prompt with AI assistance.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, tooltip="Whether to add an \"AI generated\" watermark to the result.", @@ -672,12 +672,12 @@ class WanImageToVideoApi(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -731,12 +731,12 @@ class WanImageToVideoApi(comfy_io.ComfyNode): estimated_duration=120 * int(duration / 5), poll_interval=6, ) - return comfy_io.NodeOutput(await download_url_to_video_output(response.output.video_url)) + return IO.NodeOutput(await download_url_to_video_output(response.output.video_url)) class WanApiExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ WanTextToImageApi, WanImageToImageApi, From ddfce1af4fc76768dbdd0cc4fa22d47b20a8b876 Mon Sep 17 00:00:00 2001 From: Arjan Singh <1598641+arjansingh@users.noreply.github.com> Date: Tue, 14 Oct 2025 18:08:23 -0700 Subject: [PATCH 452/759] Bump frontend to 1.28.6 (#10345) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bbb22364f..a45057970 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.27.10 +comfyui-frontend-package==1.28.6 comfyui-workflow-templates==0.1.95 comfyui-embedded-docs==0.3.0 torch From 1c10b33f9bbc75114053bc041851b60767791783 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 14 Oct 2025 21:21:11 -0700 Subject: [PATCH 453/759] gfx942 doesn't support fp8 operations. (#10348) --- comfy/model_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 709ebc40b..d82d5b8b0 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -349,7 +349,7 @@ try: if any((a in arch) for a in ["gfx1201"]): ENABLE_PYTORCH_ATTENTION = True if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4): - if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx942", "gfx950"]): # TODO: more arches + if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx950"]): # TODO: more arches, "gfx942" gives error on pytorch nightly 2.10 1013 rocm7.0 SUPPORT_FP8_OPS = True except: From f72c6616b2e91e4021591895192cef8b9d4d1c75 Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Thu, 16 Oct 2025 06:12:25 +0800 Subject: [PATCH 454/759] Add TemporalScoreRescaling node (#10351) * Add TemporalScoreRescaling node * Mention image generation in tsr_k's tooltip --- comfy_extras/nodes_eps.py | 95 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/comfy_extras/nodes_eps.py b/comfy_extras/nodes_eps.py index 7852d85e5..4d8061741 100644 --- a/comfy_extras/nodes_eps.py +++ b/comfy_extras/nodes_eps.py @@ -1,5 +1,7 @@ +import torch from typing_extensions import override +from comfy.k_diffusion.sampling import sigma_to_half_log_snr from comfy_api.latest import ComfyExtension, io @@ -63,12 +65,105 @@ class EpsilonScaling(io.ComfyNode): return io.NodeOutput(model_clone) +def compute_tsr_rescaling_factor( + snr: torch.Tensor, tsr_k: float, tsr_variance: float +) -> torch.Tensor: + """Compute the rescaling score ratio in Temporal Score Rescaling. + + See equation (6) in https://arxiv.org/pdf/2510.01184v1. + """ + posinf_mask = torch.isposinf(snr) + rescaling_factor = (snr * tsr_variance + 1) / (snr * tsr_variance / tsr_k + 1) + return torch.where(posinf_mask, tsr_k, rescaling_factor) # when snr → inf, r = tsr_k + + +class TemporalScoreRescaling(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="TemporalScoreRescaling", + display_name="TSR - Temporal Score Rescaling", + category="model_patches/unet", + inputs=[ + io.Model.Input("model"), + io.Float.Input( + "tsr_k", + tooltip=( + "Controls the rescaling strength.\n" + "Lower k produces more detailed results; higher k produces smoother results in image generation. Setting k = 1 disables rescaling." + ), + default=0.95, + min=0.01, + max=100.0, + step=0.001, + display_mode=io.NumberDisplay.number, + ), + io.Float.Input( + "tsr_sigma", + tooltip=( + "Controls how early rescaling takes effect.\n" + "Larger values take effect earlier." + ), + default=1.0, + min=0.01, + max=100.0, + step=0.001, + display_mode=io.NumberDisplay.number, + ), + ], + outputs=[ + io.Model.Output( + display_name="patched_model", + ), + ], + description=( + "[Post-CFG Function]\n" + "TSR - Temporal Score Rescaling (2510.01184)\n\n" + "Rescaling the model's score or noise to steer the sampling diversity.\n" + ), + ) + + @classmethod + def execute(cls, model, tsr_k, tsr_sigma) -> io.NodeOutput: + tsr_variance = tsr_sigma**2 + + def temporal_score_rescaling(args): + denoised = args["denoised"] + x = args["input"] + sigma = args["sigma"] + curr_model = args["model"] + + # No rescaling (r = 1) or no noise + if tsr_k == 1 or sigma == 0: + return denoised + + model_sampling = curr_model.current_patcher.get_model_object("model_sampling") + half_log_snr = sigma_to_half_log_snr(sigma, model_sampling) + snr = (2 * half_log_snr).exp() + + # No rescaling needed (r = 1) + if snr == 0: + return denoised + + rescaling_r = compute_tsr_rescaling_factor(snr, tsr_k, tsr_variance) + + # Derived from scaled_denoised = (x - r * sigma * noise) / alpha + alpha = sigma * half_log_snr.exp() + return torch.lerp(x / alpha, denoised, rescaling_r) + + m = model.clone() + m.set_model_sampler_post_cfg_function(temporal_score_rescaling) + return io.NodeOutput(m) + + class EpsilonScalingExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[io.ComfyNode]]: return [ EpsilonScaling, + TemporalScoreRescaling, ] + async def comfy_entrypoint() -> EpsilonScalingExtension: return EpsilonScalingExtension() From 74b7f0b04ba19926286518b0a0179290b79bfae0 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 16 Oct 2025 01:41:45 +0300 Subject: [PATCH 455/759] feat(api-nodes): add Veo3.1 model (#10357) --- comfy_api_nodes/nodes_veo2.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index 4588a7991..4ab5c5186 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -27,6 +27,13 @@ from comfy_api_nodes.apinode_utils import ( ) AVERAGE_DURATION_VIDEO_GEN = 32 +MODELS_MAP = { + "veo-2.0-generate-001": "veo-2.0-generate-001", + "veo-3.1-generate": "veo-3.1-generate-preview", + "veo-3.1-fast-generate": "veo-3.1-fast-generate-preview", + "veo-3.0-generate-001": "veo-3.0-generate-001", + "veo-3.0-fast-generate-001": "veo-3.0-fast-generate-001", +} def convert_image_to_base64(image: torch.Tensor): if image is None: @@ -158,6 +165,7 @@ class VeoVideoGenerationNode(IO.ComfyNode): model="veo-2.0-generate-001", generate_audio=False, ): + model = MODELS_MAP[model] # Prepare the instances for the request instances = [] @@ -385,7 +393,7 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): ), IO.Combo.Input( "model", - options=["veo-3.0-generate-001", "veo-3.0-fast-generate-001"], + options=list(MODELS_MAP.keys()), default="veo-3.0-generate-001", tooltip="Veo 3 model to use for video generation", optional=True, From 6b035bfce25b5336ed2a39c72972a8a36a80f9bd Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 15 Oct 2025 15:48:12 -0700 Subject: [PATCH 456/759] Latest pytorch stable is cu130 (#10361) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index db1fdaf3c..b0731db33 100644 --- a/README.md +++ b/README.md @@ -255,7 +255,7 @@ This is the command to install the Pytorch xpu nightly which might have some per Nvidia users should install stable pytorch using this command: -```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129``` +```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu130``` This is the command to install pytorch nightly instead which might have performance improvements. From 493b81e48f4067da95e4cee36d42a3516338da79 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Wed, 15 Oct 2025 16:47:26 -0700 Subject: [PATCH 457/759] Fix order of inputs nested merge_nested_dicts (#10362) --- comfy/patcher_extension.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/patcher_extension.py b/comfy/patcher_extension.py index 46cc7b2a8..5ee4d5ee5 100644 --- a/comfy/patcher_extension.py +++ b/comfy/patcher_extension.py @@ -150,7 +150,7 @@ def merge_nested_dicts(dict1: dict, dict2: dict, copy_dict1=True): for key, value in dict2.items(): if isinstance(value, dict): curr_value = merged_dict.setdefault(key, {}) - merged_dict[key] = merge_nested_dicts(value, curr_value) + merged_dict[key] = merge_nested_dicts(curr_value, value) elif isinstance(value, list): merged_dict.setdefault(key, []).extend(value) else: From afa8a24fe1f81d447b961fdf41f47f9094d28919 Mon Sep 17 00:00:00 2001 From: Faych <90372299+neverbiasu@users.noreply.github.com> Date: Thu, 16 Oct 2025 01:16:09 +0100 Subject: [PATCH 458/759] refactor: Replace manual patches merging with merge_nested_dicts (#10360) --- comfy/samplers.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/comfy/samplers.py b/comfy/samplers.py index c59e296a1..e7efaf470 100755 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -306,17 +306,10 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens copy_dict1=False) if patches is not None: - # TODO: replace with merge_nested_dicts function - if "patches" in transformer_options: - cur_patches = transformer_options["patches"].copy() - for p in patches: - if p in cur_patches: - cur_patches[p] = cur_patches[p] + patches[p] - else: - cur_patches[p] = patches[p] - transformer_options["patches"] = cur_patches - else: - transformer_options["patches"] = patches + transformer_options["patches"] = comfy.patcher_extension.merge_nested_dicts( + transformer_options.get("patches", {}), + patches + ) transformer_options["cond_or_uncond"] = cond_or_uncond[:] transformer_options["uuids"] = uuids[:] From 55ac7d333c55d808be33c590a4a2e6c965d5f9a8 Mon Sep 17 00:00:00 2001 From: Arjan Singh <1598641+arjansingh@users.noreply.github.com> Date: Wed, 15 Oct 2025 20:30:39 -0700 Subject: [PATCH 459/759] Bump frontend to 1.28.7 (#10364) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a45057970..82457df54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.28.6 +comfyui-frontend-package==1.28.7 comfyui-workflow-templates==0.1.95 comfyui-embedded-docs==0.3.0 torch From 4054b4bf38d11fc0c784c2d19f5fc0ed3bbc7ae4 Mon Sep 17 00:00:00 2001 From: Rizumu Ayaka Date: Thu, 16 Oct 2025 16:13:31 +0800 Subject: [PATCH 460/759] feat: deprecated API alert (#10366) --- server.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/server.py b/server.py index 80e9d3fa7..a44f4f237 100644 --- a/server.py +++ b/server.py @@ -48,6 +48,28 @@ async def send_socket_catch_exception(function, message): except (aiohttp.ClientError, aiohttp.ClientPayloadError, ConnectionResetError, BrokenPipeError, ConnectionError) as err: logging.warning("send error: {}".format(err)) +# Track deprecated paths that have been warned about to only warn once per file +_deprecated_paths_warned = set() + +@web.middleware +async def deprecation_warning(request: web.Request, handler): + """Middleware to warn about deprecated frontend API paths""" + path = request.path + + if (path.startswith('/scripts/') or path.startswith('/extensions/core/')): + # Only warn once per unique file path + if path not in _deprecated_paths_warned: + _deprecated_paths_warned.add(path) + logging.warning( + f"[DEPRECATION WARNING] Detected import of deprecated legacy API: {path}. " + f"This is likely caused by a custom node extension using outdated APIs. " + f"Please update your extensions or contact the extension author for an updated version." + ) + + response: web.Response = await handler(request) + return response + + @web.middleware async def compress_body(request: web.Request, handler): accept_encoding = request.headers.get("Accept-Encoding", "") @@ -159,7 +181,7 @@ class PromptServer(): self.client_session:Optional[aiohttp.ClientSession] = None self.number = 0 - middlewares = [cache_control] + middlewares = [cache_control, deprecation_warning] if args.enable_compress_response_body: middlewares.append(compress_body) From bc0ad9bb49b642e081f99f92d239d634988d52bc Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 16 Oct 2025 20:12:50 +0300 Subject: [PATCH 461/759] fix(api-nodes): remove "veo2" model from Veo3 node (#10372) --- comfy_api_nodes/nodes_veo2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index 4ab5c5186..daeaa823e 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -393,7 +393,9 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): ), IO.Combo.Input( "model", - options=list(MODELS_MAP.keys()), + options=[ + "veo-3.1-generate", "veo-3.1-fast-generate", "veo-3.0-generate-001", "veo-3.0-fast-generate-001" + ], default="veo-3.0-generate-001", tooltip="Veo 3 model to use for video generation", optional=True, From 19b466160c1cd43f707769adef6f8ed6e9fd50bf Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 16 Oct 2025 15:16:03 -0700 Subject: [PATCH 462/759] Workaround for nvidia issue where VAE uses 3x more memory on torch 2.9 (#10373) --- comfy/ops.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/comfy/ops.py b/comfy/ops.py index b2096b40e..893ceda98 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -52,6 +52,16 @@ try: except (ModuleNotFoundError, TypeError): logging.warning("Could not set sdpa backend priority.") +NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False +try: + if comfy.model_management.is_nvidia(): + if torch.backends.cudnn.version() >= 91300 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10): + #TODO: change upper bound version once it's fixed' + NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True + logging.info("working around nvidia conv3d memory bug.") +except: + pass + cast_to = comfy.model_management.cast_to #TODO: remove once no more references if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast: @@ -151,6 +161,15 @@ class disable_weight_init: def reset_parameters(self): return None + def _conv_forward(self, input, weight, bias, *args, **kwargs): + if NVIDIA_MEMORY_CONV_BUG_WORKAROUND and weight.dtype in (torch.float16, torch.bfloat16): + out = torch.cudnn_convolution(input, weight, self.padding, self.stride, self.dilation, self.groups, benchmark=False, deterministic=False, allow_tf32=True) + if bias is not None: + out += bias.reshape((1, -1) + (1,) * (out.ndim - 2)) + return out + else: + return super()._conv_forward(input, weight, bias, *args, **kwargs) + def forward_comfy_cast_weights(self, input): weight, bias = cast_bias_weight(self, input) return self._conv_forward(input, weight, bias) From b1293d50eff5f1ff2e54f73114fbe7c0f9aef8fe Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 16 Oct 2025 16:59:56 -0700 Subject: [PATCH 463/759] workaround also works on cudnn 91200 (#10375) --- comfy/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ops.py b/comfy/ops.py index 893ceda98..56b07b44c 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -55,7 +55,7 @@ except (ModuleNotFoundError, TypeError): NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False try: if comfy.model_management.is_nvidia(): - if torch.backends.cudnn.version() >= 91300 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10): + if torch.backends.cudnn.version() >= 91200 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10): #TODO: change upper bound version once it's fixed' NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True logging.info("working around nvidia conv3d memory bug.") From d8d60b56093a15edc5d25486d387d3c5917dc3d3 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Thu, 16 Oct 2025 21:39:37 -0700 Subject: [PATCH 464/759] Do batch_slice in EasyCache's apply_cache_diff (#10376) --- comfy_extras/nodes_easycache.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py index c170e9fd9..1359e2f99 100644 --- a/comfy_extras/nodes_easycache.py +++ b/comfy_extras/nodes_easycache.py @@ -244,6 +244,8 @@ class EasyCacheHolder: self.total_steps_skipped += 1 batch_offset = x.shape[0] // len(uuids) for i, uuid in enumerate(uuids): + # slice out only what is relevant to this cond + batch_slice = [slice(i*batch_offset,(i+1)*batch_offset)] # if cached dims don't match x dims, cut off excess and hope for the best (cosmos world2video) if x.shape[1:] != self.uuid_cache_diffs[uuid].shape[1:]: if not self.allow_mismatch: @@ -261,9 +263,8 @@ class EasyCacheHolder: slicing.append(slice(None, dim_u)) else: slicing.append(slice(None)) - slicing = [slice(i*batch_offset,(i+1)*batch_offset)] + slicing - x = x[slicing] - x += self.uuid_cache_diffs[uuid].to(x.device) + batch_slice = batch_slice + slicing + x[batch_slice] += self.uuid_cache_diffs[uuid].to(x.device) return x def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]): From b1467da4803017a418c32c159525767f45871ca3 Mon Sep 17 00:00:00 2001 From: rattus128 <46076784+rattus128@users.noreply.github.com> Date: Sat, 18 Oct 2025 06:55:15 +1000 Subject: [PATCH 465/759] execution: fold in dependency aware caching / Fix --cache-none with loops/lazy etc (#10368) * execution: fold in dependency aware caching This makes --cache-none compatiable with lazy and expanded subgraphs. Currently the --cache-none option is powered by the DependencyAwareCache. The cache attempts to maintain a parallel copy of the execution list data structure, however it is only setup once at the start of execution and does not get meaninigful updates to the execution list. This causes multiple problems when --cache-none is used with lazy and expanded subgraphs as the DAC does not accurately update its copy of the execution data structure. DAC has an attempt to handle subgraphs ensure_subcache however this does not accurately connect to nodes outside the subgraph. The current semantics of DAC are to free a node ASAP after the dependent nodes are executed. This means that if a subgraph refs such a node it will be requed and re-executed by the execution_list but DAC wont see it in its to-free lists anymore and leak memory. Rather than try and cover all the cases where the execution list changes from inside the cache, move the while problem to the executor which maintains an always up-to-date copy of the wanted data-structure. The executor now has a fast-moving run-local cache of its own. Each _to node has its own mini cache, and the cache is unconditionally primed at the time of add_strong_link. add_strong_link is called for all of static workflows, lazy links and expanded subgraphs so its the singular source of truth for output dependendencies. In the case of a cache-hit, the executor cache will hold the non-none value (it will respect updates if they happen somehow as well). In the case of a cache-miss, the executor caches a None and will wait for a notification to update the value when the node completes. When a node completes execution, it simply releases its mini-cache and in turn its strong refs on its direct anscestor outputs, allowing for ASAP freeing (same as the DependencyAwareCache but a little more automatic). This now allows for re-implementation of --cache-none with no cache at all. The dependency aware cache was also observing the dependency sematics for the objects and UI cache which is not accurate (this entire logic was always outputs specific). This also prepares for more complex caching strategies (such as RAM pressure based caching), where a cache can implement any freeing strategy completely independently of the DepedancyAwareness requirement. * main: re-implement --cache-none as no cache at all The execution list now tracks the dependency aware caching more correctly that the DependancyAwareCache. Change it to a cache that does nothing. * test_execution: add --cache-none to the test suite --cache-none is now expected to work universally. Run it through the full unit test suite. Propagate the server parameterization for whether or not the server is capabale of caching, so that the minority of tests that specifically check for cache hits can if else. Hard assert NOT caching in the else to give some coverage of --cache-none expected behaviour to not acutally cache. --- comfy_execution/caching.py | 174 ++++-------------------------- comfy_execution/graph.py | 31 +++++- execution.py | 34 +++--- main.py | 2 +- tests/execution/test_execution.py | 50 +++++---- 5 files changed, 101 insertions(+), 190 deletions(-) diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py index 41224ce3b..566bc3f9c 100644 --- a/comfy_execution/caching.py +++ b/comfy_execution/caching.py @@ -265,6 +265,26 @@ class HierarchicalCache(BasicCache): assert cache is not None return await cache._ensure_subcache(node_id, children_ids) +class NullCache: + + async def set_prompt(self, dynprompt, node_ids, is_changed_cache): + pass + + def all_node_ids(self): + return [] + + def clean_unused(self): + pass + + def get(self, node_id): + return None + + def set(self, node_id, value): + pass + + async def ensure_subcache_for(self, node_id, children_ids): + return self + class LRUCache(BasicCache): def __init__(self, key_class, max_size=100): super().__init__(key_class) @@ -316,157 +336,3 @@ class LRUCache(BasicCache): self._mark_used(child_id) self.children[cache_key].append(self.cache_key_set.get_data_key(child_id)) return self - - -class DependencyAwareCache(BasicCache): - """ - A cache implementation that tracks dependencies between nodes and manages - their execution and caching accordingly. It extends the BasicCache class. - Nodes are removed from this cache once all of their descendants have been - executed. - """ - - def __init__(self, key_class): - """ - Initialize the DependencyAwareCache. - - Args: - key_class: The class used for generating cache keys. - """ - super().__init__(key_class) - self.descendants = {} # Maps node_id -> set of descendant node_ids - self.ancestors = {} # Maps node_id -> set of ancestor node_ids - self.executed_nodes = set() # Tracks nodes that have been executed - - async def set_prompt(self, dynprompt, node_ids, is_changed_cache): - """ - Clear the entire cache and rebuild the dependency graph. - - Args: - dynprompt: The dynamic prompt object containing node information. - node_ids: List of node IDs to initialize the cache for. - is_changed_cache: Flag indicating if the cache has changed. - """ - # Clear all existing cache data - self.cache.clear() - self.subcaches.clear() - self.descendants.clear() - self.ancestors.clear() - self.executed_nodes.clear() - - # Call the parent method to initialize the cache with the new prompt - await super().set_prompt(dynprompt, node_ids, is_changed_cache) - - # Rebuild the dependency graph - self._build_dependency_graph(dynprompt, node_ids) - - def _build_dependency_graph(self, dynprompt, node_ids): - """ - Build the dependency graph for all nodes. - - Args: - dynprompt: The dynamic prompt object containing node information. - node_ids: List of node IDs to build the graph for. - """ - self.descendants.clear() - self.ancestors.clear() - for node_id in node_ids: - self.descendants[node_id] = set() - self.ancestors[node_id] = set() - - for node_id in node_ids: - inputs = dynprompt.get_node(node_id)["inputs"] - for input_data in inputs.values(): - if is_link(input_data): # Check if the input is a link to another node - ancestor_id = input_data[0] - self.descendants[ancestor_id].add(node_id) - self.ancestors[node_id].add(ancestor_id) - - def set(self, node_id, value): - """ - Mark a node as executed and store its value in the cache. - - Args: - node_id: The ID of the node to store. - value: The value to store for the node. - """ - self._set_immediate(node_id, value) - self.executed_nodes.add(node_id) - self._cleanup_ancestors(node_id) - - def get(self, node_id): - """ - Retrieve the cached value for a node. - - Args: - node_id: The ID of the node to retrieve. - - Returns: - The cached value for the node. - """ - return self._get_immediate(node_id) - - async def ensure_subcache_for(self, node_id, children_ids): - """ - Ensure a subcache exists for a node and update dependencies. - - Args: - node_id: The ID of the parent node. - children_ids: List of child node IDs to associate with the parent node. - - Returns: - The subcache object for the node. - """ - subcache = await super()._ensure_subcache(node_id, children_ids) - for child_id in children_ids: - self.descendants[node_id].add(child_id) - self.ancestors[child_id].add(node_id) - return subcache - - def _cleanup_ancestors(self, node_id): - """ - Check if ancestors of a node can be removed from the cache. - - Args: - node_id: The ID of the node whose ancestors are to be checked. - """ - for ancestor_id in self.ancestors.get(node_id, []): - if ancestor_id in self.executed_nodes: - # Remove ancestor if all its descendants have been executed - if all(descendant in self.executed_nodes for descendant in self.descendants[ancestor_id]): - self._remove_node(ancestor_id) - - def _remove_node(self, node_id): - """ - Remove a node from the cache. - - Args: - node_id: The ID of the node to remove. - """ - cache_key = self.cache_key_set.get_data_key(node_id) - if cache_key in self.cache: - del self.cache[cache_key] - subcache_key = self.cache_key_set.get_subcache_key(node_id) - if subcache_key in self.subcaches: - del self.subcaches[subcache_key] - - def clean_unused(self): - """ - Clean up unused nodes. This is a no-op for this cache implementation. - """ - pass - - def recursive_debug_dump(self): - """ - Dump the cache and dependency graph for debugging. - - Returns: - A list containing the cache state and dependency graph. - """ - result = super().recursive_debug_dump() - result.append({ - "descendants": self.descendants, - "ancestors": self.ancestors, - "executed_nodes": list(self.executed_nodes), - }) - return result diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index f4b427265..d5bbacde3 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -153,8 +153,9 @@ class TopologicalSort: continue _, _, input_info = self.get_input_info(unique_id, input_name) is_lazy = input_info is not None and "lazy" in input_info and input_info["lazy"] - if (include_lazy or not is_lazy) and not self.is_cached(from_node_id): - node_ids.append(from_node_id) + if (include_lazy or not is_lazy): + if not self.is_cached(from_node_id): + node_ids.append(from_node_id) links.append((from_node_id, from_socket, unique_id)) for link in links: @@ -194,10 +195,34 @@ class ExecutionList(TopologicalSort): super().__init__(dynprompt) self.output_cache = output_cache self.staged_node_id = None + self.execution_cache = {} + self.execution_cache_listeners = {} def is_cached(self, node_id): return self.output_cache.get(node_id) is not None + def cache_link(self, from_node_id, to_node_id): + if not to_node_id in self.execution_cache: + self.execution_cache[to_node_id] = {} + self.execution_cache[to_node_id][from_node_id] = self.output_cache.get(from_node_id) + if not from_node_id in self.execution_cache_listeners: + self.execution_cache_listeners[from_node_id] = set() + self.execution_cache_listeners[from_node_id].add(to_node_id) + + def get_output_cache(self, from_node_id, to_node_id): + if not to_node_id in self.execution_cache: + return None + return self.execution_cache[to_node_id].get(from_node_id) + + def cache_update(self, node_id, value): + if node_id in self.execution_cache_listeners: + for to_node_id in self.execution_cache_listeners[node_id]: + self.execution_cache[to_node_id][node_id] = value + + def add_strong_link(self, from_node_id, from_socket, to_node_id): + super().add_strong_link(from_node_id, from_socket, to_node_id) + self.cache_link(from_node_id, to_node_id) + async def stage_node_execution(self): assert self.staged_node_id is None if self.is_empty(): @@ -277,6 +302,8 @@ class ExecutionList(TopologicalSort): def complete_node_execution(self): node_id = self.staged_node_id self.pop_node(node_id) + self.execution_cache.pop(node_id, None) + self.execution_cache_listeners.pop(node_id, None) self.staged_node_id = None def get_nodes_in_cycle(self): diff --git a/execution.py b/execution.py index 1dc35738b..78c36a4b0 100644 --- a/execution.py +++ b/execution.py @@ -18,7 +18,7 @@ from comfy_execution.caching import ( BasicCache, CacheKeySetID, CacheKeySetInputSignature, - DependencyAwareCache, + NullCache, HierarchicalCache, LRUCache, ) @@ -91,13 +91,13 @@ class IsChangedCache: class CacheType(Enum): CLASSIC = 0 LRU = 1 - DEPENDENCY_AWARE = 2 + NONE = 2 class CacheSet: def __init__(self, cache_type=None, cache_size=None): - if cache_type == CacheType.DEPENDENCY_AWARE: - self.init_dependency_aware_cache() + if cache_type == CacheType.NONE: + self.init_null_cache() logging.info("Disabling intermediate node cache.") elif cache_type == CacheType.LRU: if cache_size is None: @@ -120,11 +120,12 @@ class CacheSet: self.ui = LRUCache(CacheKeySetInputSignature, max_size=cache_size) self.objects = HierarchicalCache(CacheKeySetID) - # only hold cached items while the decendents have not executed - def init_dependency_aware_cache(self): - self.outputs = DependencyAwareCache(CacheKeySetInputSignature) - self.ui = DependencyAwareCache(CacheKeySetInputSignature) - self.objects = DependencyAwareCache(CacheKeySetID) + def init_null_cache(self): + self.outputs = NullCache() + #The UI cache is expected to be iterable at the end of each workflow + #so it must cache at least a full workflow. Use Heirachical + self.ui = HierarchicalCache(CacheKeySetInputSignature) + self.objects = NullCache() def recursive_debug_dump(self): result = { @@ -135,7 +136,7 @@ class CacheSet: SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org") -def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, extra_data={}): +def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=None, extra_data={}): is_v3 = issubclass(class_def, _ComfyNodeInternal) if is_v3: valid_inputs, schema = class_def.INPUT_TYPES(include_hidden=False, return_schema=True) @@ -153,10 +154,10 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e if is_link(input_data) and (not input_info or not input_info.get("rawLink", False)): input_unique_id = input_data[0] output_index = input_data[1] - if outputs is None: + if execution_list is None: mark_missing() continue # This might be a lazily-evaluated input - cached_output = outputs.get(input_unique_id) + cached_output = execution_list.get_output_cache(input_unique_id, unique_id) if cached_output is None: mark_missing() continue @@ -405,6 +406,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, cached_output = caches.ui.get(unique_id) or {} server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_output.get("output",None), "prompt_id": prompt_id }, server.client_id) get_progress_state().finish_progress(unique_id) + execution_list.cache_update(unique_id, caches.outputs.get(unique_id)) return (ExecutionResult.SUCCESS, None, None) input_data_all = None @@ -434,7 +436,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, for r in result: if is_link(r): source_node, source_output = r[0], r[1] - node_output = caches.outputs.get(source_node)[source_output] + node_output = execution_list.get_output_cache(source_node, unique_id)[source_output] for o in node_output: resolved_output.append(o) @@ -446,7 +448,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, has_subgraph = False else: get_progress_state().start_progress(unique_id) - input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data) + input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data) if server.client_id is not None: server.last_node_id = display_node_id server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id) @@ -549,11 +551,15 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, subcache.clean_unused() for node_id in new_output_ids: execution_list.add_node(node_id) + execution_list.cache_link(node_id, unique_id) for link in new_output_links: execution_list.add_strong_link(link[0], link[1], unique_id) pending_subgraph_results[unique_id] = cached_outputs return (ExecutionResult.PENDING, None, None) + caches.outputs.set(unique_id, output_data) + execution_list.cache_update(unique_id, output_data) + except comfy.model_management.InterruptProcessingException as iex: logging.info("Processing interrupted") diff --git a/main.py b/main.py index 35857dba8..4b4c5dcc4 100644 --- a/main.py +++ b/main.py @@ -173,7 +173,7 @@ def prompt_worker(q, server_instance): if args.cache_lru > 0: cache_type = execution.CacheType.LRU elif args.cache_none: - cache_type = execution.CacheType.DEPENDENCY_AWARE + cache_type = execution.CacheType.NONE e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_size=args.cache_lru) last_gc_collect = 0 diff --git a/tests/execution/test_execution.py b/tests/execution/test_execution.py index ef73ad9fd..ace0d2279 100644 --- a/tests/execution/test_execution.py +++ b/tests/execution/test_execution.py @@ -152,12 +152,12 @@ class TestExecution: # Initialize server and client # @fixture(scope="class", autouse=True, params=[ - # (use_lru, lru_size) - (False, 0), - (True, 0), - (True, 100), + { "extra_args" : [], "should_cache_results" : True }, + { "extra_args" : ["--cache-lru", 0], "should_cache_results" : True }, + { "extra_args" : ["--cache-lru", 100], "should_cache_results" : True }, + { "extra_args" : ["--cache-none"], "should_cache_results" : False }, ]) - def _server(self, args_pytest, request): + def server(self, args_pytest, request): # Start server pargs = [ 'python','main.py', @@ -167,12 +167,10 @@ class TestExecution: '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml', '--cpu', ] - use_lru, lru_size = request.param - if use_lru: - pargs += ['--cache-lru', str(lru_size)] + pargs += [ str(param) for param in request.param["extra_args"] ] print("Running server with args:", pargs) # noqa: T201 p = subprocess.Popen(pargs) - yield + yield request.param p.kill() torch.cuda.empty_cache() @@ -193,7 +191,7 @@ class TestExecution: return comfy_client @fixture(scope="class", autouse=True) - def shared_client(self, args_pytest, _server): + def shared_client(self, args_pytest, server): client = self.start_client(args_pytest["listen"], args_pytest["port"]) yield client del client @@ -225,7 +223,7 @@ class TestExecution: assert result.did_run(mask) assert result.did_run(lazy_mix) - def test_full_cache(self, client: ComfyClient, builder: GraphBuilder): + def test_full_cache(self, client: ComfyClient, builder: GraphBuilder, server): g = builder input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) input2 = g.node("StubImage", content="NOISE", height=512, width=512, batch_size=1) @@ -237,9 +235,12 @@ class TestExecution: client.run(g) result2 = client.run(g) for node_id, node in g.nodes.items(): - assert not result2.did_run(node), f"Node {node_id} ran, but should have been cached" + if server["should_cache_results"]: + assert not result2.did_run(node), f"Node {node_id} ran, but should have been cached" + else: + assert result2.did_run(node), f"Node {node_id} was cached, but should have been run" - def test_partial_cache(self, client: ComfyClient, builder: GraphBuilder): + def test_partial_cache(self, client: ComfyClient, builder: GraphBuilder, server): g = builder input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) input2 = g.node("StubImage", content="NOISE", height=512, width=512, batch_size=1) @@ -251,8 +252,12 @@ class TestExecution: client.run(g) mask.inputs['value'] = 0.4 result2 = client.run(g) - assert not result2.did_run(input1), "Input1 should have been cached" - assert not result2.did_run(input2), "Input2 should have been cached" + if server["should_cache_results"]: + assert not result2.did_run(input1), "Input1 should have been cached" + assert not result2.did_run(input2), "Input2 should have been cached" + else: + assert result2.did_run(input1), "Input1 should have been rerun" + assert result2.did_run(input2), "Input2 should have been rerun" def test_error(self, client: ComfyClient, builder: GraphBuilder): g = builder @@ -411,7 +416,7 @@ class TestExecution: input2 = g.node("StubImage", id="removeme", content="WHITE", height=512, width=512, batch_size=1) client.run(g) - def test_custom_is_changed(self, client: ComfyClient, builder: GraphBuilder): + def test_custom_is_changed(self, client: ComfyClient, builder: GraphBuilder, server): g = builder # Creating the nodes in this specific order previously caused a bug save = g.node("SaveImage") @@ -427,7 +432,10 @@ class TestExecution: result3 = client.run(g) result4 = client.run(g) assert result1.did_run(is_changed), "is_changed should have been run" - assert not result2.did_run(is_changed), "is_changed should have been cached" + if server["should_cache_results"]: + assert not result2.did_run(is_changed), "is_changed should have been cached" + else: + assert result2.did_run(is_changed), "is_changed should have been re-run" assert result3.did_run(is_changed), "is_changed should have been re-run" assert result4.did_run(is_changed), "is_changed should not have been cached" @@ -514,7 +522,7 @@ class TestExecution: assert len(images2) == 1, "Should have 1 image" # This tests that only constant outputs are used in the call to `IS_CHANGED` - def test_is_changed_with_outputs(self, client: ComfyClient, builder: GraphBuilder): + def test_is_changed_with_outputs(self, client: ComfyClient, builder: GraphBuilder, server): g = builder input1 = g.node("StubConstantImage", value=0.5, height=512, width=512, batch_size=1) test_node = g.node("TestIsChangedWithConstants", image=input1.out(0), value=0.5) @@ -530,7 +538,11 @@ class TestExecution: images = result.get_images(output) assert len(images) == 1, "Should have 1 image" assert numpy.array(images[0]).min() == 63 and numpy.array(images[0]).max() == 63, "Image should have value 0.25" - assert not result.did_run(test_node), "The execution should have been cached" + if server["should_cache_results"]: + assert not result.did_run(test_node), "The execution should have been cached" + else: + assert result.did_run(test_node), "The execution should have been re-run" + def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): # Warmup execution to ensure server is fully initialized From 99ce2a1f66c4bcd500d76cc9a7430f7b2bf32776 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 18 Oct 2025 00:13:05 +0300 Subject: [PATCH 466/759] convert nodes_controlnet.py to V3 schema (#10202) --- comfy_extras/nodes_controlnet.py | 92 ++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 34 deletions(-) diff --git a/comfy_extras/nodes_controlnet.py b/comfy_extras/nodes_controlnet.py index 2d20e1fed..e835feed7 100644 --- a/comfy_extras/nodes_controlnet.py +++ b/comfy_extras/nodes_controlnet.py @@ -1,20 +1,26 @@ from comfy.cldm.control_types import UNION_CONTROLNET_TYPES import nodes import comfy.utils +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class SetUnionControlNetType: +class SetUnionControlNetType(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"control_net": ("CONTROL_NET", ), - "type": (["auto"] + list(UNION_CONTROLNET_TYPES.keys()),) - }} + def define_schema(cls): + return io.Schema( + node_id="SetUnionControlNetType", + category="conditioning/controlnet", + inputs=[ + io.ControlNet.Input("control_net"), + io.Combo.Input("type", options=["auto"] + list(UNION_CONTROLNET_TYPES.keys())), + ], + outputs=[ + io.ControlNet.Output(), + ], + ) - CATEGORY = "conditioning/controlnet" - RETURN_TYPES = ("CONTROL_NET",) - - FUNCTION = "set_controlnet_type" - - def set_controlnet_type(self, control_net, type): + @classmethod + def execute(cls, control_net, type) -> io.NodeOutput: control_net = control_net.copy() type_number = UNION_CONTROLNET_TYPES.get(type, -1) if type_number >= 0: @@ -22,27 +28,36 @@ class SetUnionControlNetType: else: control_net.set_extra_arg("control_type", []) - return (control_net,) + return io.NodeOutput(control_net) -class ControlNetInpaintingAliMamaApply(nodes.ControlNetApplyAdvanced): + set_controlnet_type = execute # TODO: remove + + +class ControlNetInpaintingAliMamaApply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "control_net": ("CONTROL_NET", ), - "vae": ("VAE", ), - "image": ("IMAGE", ), - "mask": ("MASK", ), - "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001}) - }} + def define_schema(cls): + return io.Schema( + node_id="ControlNetInpaintingAliMamaApply", + category="conditioning/controlnet", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.ControlNet.Input("control_net"), + io.Vae.Input("vae"), + io.Image.Input("image"), + io.Mask.Input("mask"), + io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=1.0, min=0.0, max=1.0, step=0.001), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + ], + ) - FUNCTION = "apply_inpaint_controlnet" - - CATEGORY = "conditioning/controlnet" - - def apply_inpaint_controlnet(self, positive, negative, control_net, vae, image, mask, strength, start_percent, end_percent): + @classmethod + def execute(cls, positive, negative, control_net, vae, image, mask, strength, start_percent, end_percent) -> io.NodeOutput: extra_concat = [] if control_net.concat_mask: mask = 1.0 - mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])) @@ -50,11 +65,20 @@ class ControlNetInpaintingAliMamaApply(nodes.ControlNetApplyAdvanced): image = image * mask_apply.movedim(1, -1).repeat(1, 1, 1, image.shape[3]) extra_concat = [mask] - return self.apply_controlnet(positive, negative, control_net, image, strength, start_percent, end_percent, vae=vae, extra_concat=extra_concat) + result = nodes.ControlNetApplyAdvanced().apply_controlnet(positive, negative, control_net, image, strength, start_percent, end_percent, vae=vae, extra_concat=extra_concat) + return io.NodeOutput(result[0], result[1]) + + apply_inpaint_controlnet = execute # TODO: remove +class ControlNetExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + SetUnionControlNetType, + ControlNetInpaintingAliMamaApply, + ] -NODE_CLASS_MAPPINGS = { - "SetUnionControlNetType": SetUnionControlNetType, - "ControlNetInpaintingAliMamaApply": ControlNetInpaintingAliMamaApply, -} + +async def comfy_entrypoint() -> ControlNetExtension: + return ControlNetExtension() From 92d97380bd02d9883295aeb2d29365cecd9a765e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 17 Oct 2025 15:22:59 -0700 Subject: [PATCH 467/759] Update Python 3.14 installation instructions (#10385) Removed mention of installing pytorch nightly for Python 3.14. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b0731db33..c9a0644e3 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,7 @@ comfy install ## Manual Install (Windows, Linux) -Python 3.14 will work if you comment out the `kornia` dependency in the requirements.txt file (breaks the canny node) and install pytorch nightly but it is not recommended. +Python 3.14 will work if you comment out the `kornia` dependency in the requirements.txt file (breaks the canny node) but it is not recommended. Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12 From 9da397ea2f271080406f0c14cf4f0db7221ddf70 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:03:28 -0700 Subject: [PATCH 468/759] Disable torch compiler for cast_bias_weight function (#10384) * Disable torch compiler for cast_bias_weight function * Fix torch compile. --- comfy/ops.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/comfy/ops.py b/comfy/ops.py index 56b07b44c..5feeb3571 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -25,6 +25,9 @@ import comfy.rmsnorm import contextlib def run_every_op(): + if torch.compiler.is_compiling(): + return + comfy.model_management.throw_exception_if_processing_interrupted() def scaled_dot_product_attention(q, k, v, *args, **kwargs): @@ -70,6 +73,7 @@ if torch.cuda.is_available() and torch.backends.cudnn.is_available() and Perform def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) +@torch.compiler.disable() def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None): if input is not None: if dtype is None: From 5b80addafd24bda5b2f9f7a35e32dbd40823c3fd Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 18 Oct 2025 19:35:46 -0700 Subject: [PATCH 469/759] Turn off cuda malloc by default when --fast autotune is turned on. (#10393) --- comfy/model_management.py | 3 +++ comfy/ops.py | 3 --- cuda_malloc.py | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index d82d5b8b0..7467391cd 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -371,6 +371,9 @@ try: except: pass +if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast: + torch.backends.cudnn.benchmark = True + try: if torch_version_numeric >= (2, 5): torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True) diff --git a/comfy/ops.py b/comfy/ops.py index 5feeb3571..967134f05 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -67,9 +67,6 @@ except: cast_to = comfy.model_management.cast_to #TODO: remove once no more references -if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast: - torch.backends.cudnn.benchmark = True - def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) diff --git a/cuda_malloc.py b/cuda_malloc.py index c1d9ae3ca..6520d5123 100644 --- a/cuda_malloc.py +++ b/cuda_malloc.py @@ -1,6 +1,6 @@ import os import importlib.util -from comfy.cli_args import args +from comfy.cli_args import args, PerformanceFeature import subprocess #Can't use pytorch to get the GPU names because the cuda malloc has to be set before the first import. @@ -75,8 +75,9 @@ if not args.cuda_malloc: spec.loader.exec_module(module) version = module.__version__ - if int(version[0]) >= 2 and "+cu" in version: #enable by default for torch version 2.0 and up only on cuda torch - args.cuda_malloc = cuda_malloc_supported() + if int(version[0]) >= 2 and "+cu" in version: # enable by default for torch version 2.0 and up only on cuda torch + if PerformanceFeature.AutoTune not in args.fast: # Autotune has issues with cuda malloc + args.cuda_malloc = cuda_malloc_supported() except: pass From 0cf33953a7c951d163088cbfe36c55d1cdf8a718 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 18 Oct 2025 20:15:34 -0700 Subject: [PATCH 470/759] Fix batch size above 1 giving bad output in chroma radiance. (#10394) --- comfy/ldm/chroma_radiance/model.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/comfy/ldm/chroma_radiance/model.py b/comfy/ldm/chroma_radiance/model.py index 47aa11b04..7d7be80f5 100644 --- a/comfy/ldm/chroma_radiance/model.py +++ b/comfy/ldm/chroma_radiance/model.py @@ -189,15 +189,15 @@ class ChromaRadiance(Chroma): nerf_pixels = nn.functional.unfold(img_orig, kernel_size=patch_size, stride=patch_size) nerf_pixels = nerf_pixels.transpose(1, 2) # -> [B, NumPatches, C * P * P] + # Reshape for per-patch processing + nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size) + nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2) + if params.nerf_tile_size > 0 and num_patches > params.nerf_tile_size: # Enable tiling if nerf_tile_size isn't 0 and we actually have more patches than # the tile size. - img_dct = self.forward_tiled_nerf(img_out, nerf_pixels, B, C, num_patches, patch_size, params) + img_dct = self.forward_tiled_nerf(nerf_hidden, nerf_pixels, B, C, num_patches, patch_size, params) else: - # Reshape for per-patch processing - nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size) - nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2) - # Get DCT-encoded pixel embeddings [pixel-dct] img_dct = self.nerf_image_embedder(nerf_pixels) @@ -240,17 +240,8 @@ class ChromaRadiance(Chroma): end = min(i + tile_size, num_patches) # Slice the current tile from the input tensors - nerf_hidden_tile = nerf_hidden[:, i:end, :] - nerf_pixels_tile = nerf_pixels[:, i:end, :] - - # Get the actual number of patches in this tile (can be smaller for the last tile) - num_patches_tile = nerf_hidden_tile.shape[1] - - # Reshape the tile for per-patch processing - # [B, NumPatches_tile, D] -> [B * NumPatches_tile, D] - nerf_hidden_tile = nerf_hidden_tile.reshape(batch * num_patches_tile, params.hidden_size) - # [B, NumPatches_tile, C*P*P] -> [B*NumPatches_tile, C, P*P] -> [B*NumPatches_tile, P*P, C] - nerf_pixels_tile = nerf_pixels_tile.reshape(batch * num_patches_tile, channels, patch_size**2).transpose(1, 2) + nerf_hidden_tile = nerf_hidden[i * batch:end * batch] + nerf_pixels_tile = nerf_pixels[i * batch:end * batch] # get DCT-encoded pixel embeddings [pixel-dct] img_dct_tile = self.nerf_image_embedder(nerf_pixels_tile) From dad076aee68ab676fb390d9663ab9e343824a080 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 18 Oct 2025 20:19:52 -0700 Subject: [PATCH 471/759] Speed up chroma radiance. (#10395) --- comfy/model_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 7677617c0..141f1e164 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -213,7 +213,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["nerf_mlp_ratio"] = 4 dit_config["nerf_depth"] = 4 dit_config["nerf_max_freqs"] = 8 - dit_config["nerf_tile_size"] = 32 + dit_config["nerf_tile_size"] = 512 dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear" dit_config["nerf_embedder_dtype"] = torch.float32 else: From b4f30bd4087a79b4c4fc89bb67b9889adb866294 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 18 Oct 2025 22:25:35 -0700 Subject: [PATCH 472/759] Pytorch is stupid. (#10398) --- comfy/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ops.py b/comfy/ops.py index 967134f05..934e21261 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -58,7 +58,7 @@ except (ModuleNotFoundError, TypeError): NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False try: if comfy.model_management.is_nvidia(): - if torch.backends.cudnn.version() >= 91200 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10): + if torch.backends.cudnn.version() >= 91002 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10): #TODO: change upper bound version once it's fixed' NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True logging.info("working around nvidia conv3d memory bug.") From b5c59b763c6b14e1362ec4274b09eca4f3f7091b Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Sun, 19 Oct 2025 13:05:46 -0700 Subject: [PATCH 473/759] Deprecation warning on unused files (#10387) * only warn for unused files * include internal extensions --- server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server.py b/server.py index a44f4f237..10c2698b5 100644 --- a/server.py +++ b/server.py @@ -56,7 +56,7 @@ async def deprecation_warning(request: web.Request, handler): """Middleware to warn about deprecated frontend API paths""" path = request.path - if (path.startswith('/scripts/') or path.startswith('/extensions/core/')): + if path.startswith("/scripts/ui") or path.startswith("/extensions/core/"): # Only warn once per unique file path if path not in _deprecated_paths_warned: _deprecated_paths_warned.add(path) From a4787ac83bf6c83eeb459ed80fc9b36f63d2a3a7 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 21 Oct 2025 03:28:36 +0800 Subject: [PATCH 474/759] Update template to 0.2.1 (#10413) * Update template to 0.1.97 * Update template to 0.2.1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 82457df54..dd2afcab0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.28.7 -comfyui-workflow-templates==0.1.95 +comfyui-workflow-templates==0.2.1 comfyui-embedded-docs==0.3.0 torch torchsde From 2c2aa409b01f513de88d2245931e5836ed1cd718 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 20 Oct 2025 12:43:24 -0700 Subject: [PATCH 475/759] Log message for cudnn disable on AMD. (#10418) --- comfy/model_management.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy/model_management.py b/comfy/model_management.py index 7467391cd..a2c318ec3 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -333,6 +333,7 @@ SUPPORT_FP8_OPS = args.supports_fp8_compute try: if is_amd(): torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD + logging.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.") try: rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2])) except: From b7992f871af38d89a459080caa57cc359ed93a46 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 20 Oct 2025 16:03:06 -0700 Subject: [PATCH 476/759] =?UTF-8?q?Revert=20"execution:=20fold=20in=20depe?= =?UTF-8?q?ndency=20aware=20caching=20/=20Fix=20--cache-none=20with=20l?= =?UTF-8?q?=E2=80=A6"=20(#10422)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit b1467da4803017a418c32c159525767f45871ca3. --- comfy_execution/caching.py | 174 ++++++++++++++++++++++++++---- comfy_execution/graph.py | 31 +----- execution.py | 34 +++--- main.py | 2 +- tests/execution/test_execution.py | 50 ++++----- 5 files changed, 190 insertions(+), 101 deletions(-) diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py index 566bc3f9c..41224ce3b 100644 --- a/comfy_execution/caching.py +++ b/comfy_execution/caching.py @@ -265,26 +265,6 @@ class HierarchicalCache(BasicCache): assert cache is not None return await cache._ensure_subcache(node_id, children_ids) -class NullCache: - - async def set_prompt(self, dynprompt, node_ids, is_changed_cache): - pass - - def all_node_ids(self): - return [] - - def clean_unused(self): - pass - - def get(self, node_id): - return None - - def set(self, node_id, value): - pass - - async def ensure_subcache_for(self, node_id, children_ids): - return self - class LRUCache(BasicCache): def __init__(self, key_class, max_size=100): super().__init__(key_class) @@ -336,3 +316,157 @@ class LRUCache(BasicCache): self._mark_used(child_id) self.children[cache_key].append(self.cache_key_set.get_data_key(child_id)) return self + + +class DependencyAwareCache(BasicCache): + """ + A cache implementation that tracks dependencies between nodes and manages + their execution and caching accordingly. It extends the BasicCache class. + Nodes are removed from this cache once all of their descendants have been + executed. + """ + + def __init__(self, key_class): + """ + Initialize the DependencyAwareCache. + + Args: + key_class: The class used for generating cache keys. + """ + super().__init__(key_class) + self.descendants = {} # Maps node_id -> set of descendant node_ids + self.ancestors = {} # Maps node_id -> set of ancestor node_ids + self.executed_nodes = set() # Tracks nodes that have been executed + + async def set_prompt(self, dynprompt, node_ids, is_changed_cache): + """ + Clear the entire cache and rebuild the dependency graph. + + Args: + dynprompt: The dynamic prompt object containing node information. + node_ids: List of node IDs to initialize the cache for. + is_changed_cache: Flag indicating if the cache has changed. + """ + # Clear all existing cache data + self.cache.clear() + self.subcaches.clear() + self.descendants.clear() + self.ancestors.clear() + self.executed_nodes.clear() + + # Call the parent method to initialize the cache with the new prompt + await super().set_prompt(dynprompt, node_ids, is_changed_cache) + + # Rebuild the dependency graph + self._build_dependency_graph(dynprompt, node_ids) + + def _build_dependency_graph(self, dynprompt, node_ids): + """ + Build the dependency graph for all nodes. + + Args: + dynprompt: The dynamic prompt object containing node information. + node_ids: List of node IDs to build the graph for. + """ + self.descendants.clear() + self.ancestors.clear() + for node_id in node_ids: + self.descendants[node_id] = set() + self.ancestors[node_id] = set() + + for node_id in node_ids: + inputs = dynprompt.get_node(node_id)["inputs"] + for input_data in inputs.values(): + if is_link(input_data): # Check if the input is a link to another node + ancestor_id = input_data[0] + self.descendants[ancestor_id].add(node_id) + self.ancestors[node_id].add(ancestor_id) + + def set(self, node_id, value): + """ + Mark a node as executed and store its value in the cache. + + Args: + node_id: The ID of the node to store. + value: The value to store for the node. + """ + self._set_immediate(node_id, value) + self.executed_nodes.add(node_id) + self._cleanup_ancestors(node_id) + + def get(self, node_id): + """ + Retrieve the cached value for a node. + + Args: + node_id: The ID of the node to retrieve. + + Returns: + The cached value for the node. + """ + return self._get_immediate(node_id) + + async def ensure_subcache_for(self, node_id, children_ids): + """ + Ensure a subcache exists for a node and update dependencies. + + Args: + node_id: The ID of the parent node. + children_ids: List of child node IDs to associate with the parent node. + + Returns: + The subcache object for the node. + """ + subcache = await super()._ensure_subcache(node_id, children_ids) + for child_id in children_ids: + self.descendants[node_id].add(child_id) + self.ancestors[child_id].add(node_id) + return subcache + + def _cleanup_ancestors(self, node_id): + """ + Check if ancestors of a node can be removed from the cache. + + Args: + node_id: The ID of the node whose ancestors are to be checked. + """ + for ancestor_id in self.ancestors.get(node_id, []): + if ancestor_id in self.executed_nodes: + # Remove ancestor if all its descendants have been executed + if all(descendant in self.executed_nodes for descendant in self.descendants[ancestor_id]): + self._remove_node(ancestor_id) + + def _remove_node(self, node_id): + """ + Remove a node from the cache. + + Args: + node_id: The ID of the node to remove. + """ + cache_key = self.cache_key_set.get_data_key(node_id) + if cache_key in self.cache: + del self.cache[cache_key] + subcache_key = self.cache_key_set.get_subcache_key(node_id) + if subcache_key in self.subcaches: + del self.subcaches[subcache_key] + + def clean_unused(self): + """ + Clean up unused nodes. This is a no-op for this cache implementation. + """ + pass + + def recursive_debug_dump(self): + """ + Dump the cache and dependency graph for debugging. + + Returns: + A list containing the cache state and dependency graph. + """ + result = super().recursive_debug_dump() + result.append({ + "descendants": self.descendants, + "ancestors": self.ancestors, + "executed_nodes": list(self.executed_nodes), + }) + return result diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index d5bbacde3..f4b427265 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -153,9 +153,8 @@ class TopologicalSort: continue _, _, input_info = self.get_input_info(unique_id, input_name) is_lazy = input_info is not None and "lazy" in input_info and input_info["lazy"] - if (include_lazy or not is_lazy): - if not self.is_cached(from_node_id): - node_ids.append(from_node_id) + if (include_lazy or not is_lazy) and not self.is_cached(from_node_id): + node_ids.append(from_node_id) links.append((from_node_id, from_socket, unique_id)) for link in links: @@ -195,34 +194,10 @@ class ExecutionList(TopologicalSort): super().__init__(dynprompt) self.output_cache = output_cache self.staged_node_id = None - self.execution_cache = {} - self.execution_cache_listeners = {} def is_cached(self, node_id): return self.output_cache.get(node_id) is not None - def cache_link(self, from_node_id, to_node_id): - if not to_node_id in self.execution_cache: - self.execution_cache[to_node_id] = {} - self.execution_cache[to_node_id][from_node_id] = self.output_cache.get(from_node_id) - if not from_node_id in self.execution_cache_listeners: - self.execution_cache_listeners[from_node_id] = set() - self.execution_cache_listeners[from_node_id].add(to_node_id) - - def get_output_cache(self, from_node_id, to_node_id): - if not to_node_id in self.execution_cache: - return None - return self.execution_cache[to_node_id].get(from_node_id) - - def cache_update(self, node_id, value): - if node_id in self.execution_cache_listeners: - for to_node_id in self.execution_cache_listeners[node_id]: - self.execution_cache[to_node_id][node_id] = value - - def add_strong_link(self, from_node_id, from_socket, to_node_id): - super().add_strong_link(from_node_id, from_socket, to_node_id) - self.cache_link(from_node_id, to_node_id) - async def stage_node_execution(self): assert self.staged_node_id is None if self.is_empty(): @@ -302,8 +277,6 @@ class ExecutionList(TopologicalSort): def complete_node_execution(self): node_id = self.staged_node_id self.pop_node(node_id) - self.execution_cache.pop(node_id, None) - self.execution_cache_listeners.pop(node_id, None) self.staged_node_id = None def get_nodes_in_cycle(self): diff --git a/execution.py b/execution.py index 78c36a4b0..1dc35738b 100644 --- a/execution.py +++ b/execution.py @@ -18,7 +18,7 @@ from comfy_execution.caching import ( BasicCache, CacheKeySetID, CacheKeySetInputSignature, - NullCache, + DependencyAwareCache, HierarchicalCache, LRUCache, ) @@ -91,13 +91,13 @@ class IsChangedCache: class CacheType(Enum): CLASSIC = 0 LRU = 1 - NONE = 2 + DEPENDENCY_AWARE = 2 class CacheSet: def __init__(self, cache_type=None, cache_size=None): - if cache_type == CacheType.NONE: - self.init_null_cache() + if cache_type == CacheType.DEPENDENCY_AWARE: + self.init_dependency_aware_cache() logging.info("Disabling intermediate node cache.") elif cache_type == CacheType.LRU: if cache_size is None: @@ -120,12 +120,11 @@ class CacheSet: self.ui = LRUCache(CacheKeySetInputSignature, max_size=cache_size) self.objects = HierarchicalCache(CacheKeySetID) - def init_null_cache(self): - self.outputs = NullCache() - #The UI cache is expected to be iterable at the end of each workflow - #so it must cache at least a full workflow. Use Heirachical - self.ui = HierarchicalCache(CacheKeySetInputSignature) - self.objects = NullCache() + # only hold cached items while the decendents have not executed + def init_dependency_aware_cache(self): + self.outputs = DependencyAwareCache(CacheKeySetInputSignature) + self.ui = DependencyAwareCache(CacheKeySetInputSignature) + self.objects = DependencyAwareCache(CacheKeySetID) def recursive_debug_dump(self): result = { @@ -136,7 +135,7 @@ class CacheSet: SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org") -def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=None, extra_data={}): +def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, extra_data={}): is_v3 = issubclass(class_def, _ComfyNodeInternal) if is_v3: valid_inputs, schema = class_def.INPUT_TYPES(include_hidden=False, return_schema=True) @@ -154,10 +153,10 @@ def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt= if is_link(input_data) and (not input_info or not input_info.get("rawLink", False)): input_unique_id = input_data[0] output_index = input_data[1] - if execution_list is None: + if outputs is None: mark_missing() continue # This might be a lazily-evaluated input - cached_output = execution_list.get_output_cache(input_unique_id, unique_id) + cached_output = outputs.get(input_unique_id) if cached_output is None: mark_missing() continue @@ -406,7 +405,6 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, cached_output = caches.ui.get(unique_id) or {} server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_output.get("output",None), "prompt_id": prompt_id }, server.client_id) get_progress_state().finish_progress(unique_id) - execution_list.cache_update(unique_id, caches.outputs.get(unique_id)) return (ExecutionResult.SUCCESS, None, None) input_data_all = None @@ -436,7 +434,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, for r in result: if is_link(r): source_node, source_output = r[0], r[1] - node_output = execution_list.get_output_cache(source_node, unique_id)[source_output] + node_output = caches.outputs.get(source_node)[source_output] for o in node_output: resolved_output.append(o) @@ -448,7 +446,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, has_subgraph = False else: get_progress_state().start_progress(unique_id) - input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data) + input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data) if server.client_id is not None: server.last_node_id = display_node_id server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id) @@ -551,15 +549,11 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, subcache.clean_unused() for node_id in new_output_ids: execution_list.add_node(node_id) - execution_list.cache_link(node_id, unique_id) for link in new_output_links: execution_list.add_strong_link(link[0], link[1], unique_id) pending_subgraph_results[unique_id] = cached_outputs return (ExecutionResult.PENDING, None, None) - caches.outputs.set(unique_id, output_data) - execution_list.cache_update(unique_id, output_data) - except comfy.model_management.InterruptProcessingException as iex: logging.info("Processing interrupted") diff --git a/main.py b/main.py index 4b4c5dcc4..35857dba8 100644 --- a/main.py +++ b/main.py @@ -173,7 +173,7 @@ def prompt_worker(q, server_instance): if args.cache_lru > 0: cache_type = execution.CacheType.LRU elif args.cache_none: - cache_type = execution.CacheType.NONE + cache_type = execution.CacheType.DEPENDENCY_AWARE e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_size=args.cache_lru) last_gc_collect = 0 diff --git a/tests/execution/test_execution.py b/tests/execution/test_execution.py index ace0d2279..ef73ad9fd 100644 --- a/tests/execution/test_execution.py +++ b/tests/execution/test_execution.py @@ -152,12 +152,12 @@ class TestExecution: # Initialize server and client # @fixture(scope="class", autouse=True, params=[ - { "extra_args" : [], "should_cache_results" : True }, - { "extra_args" : ["--cache-lru", 0], "should_cache_results" : True }, - { "extra_args" : ["--cache-lru", 100], "should_cache_results" : True }, - { "extra_args" : ["--cache-none"], "should_cache_results" : False }, + # (use_lru, lru_size) + (False, 0), + (True, 0), + (True, 100), ]) - def server(self, args_pytest, request): + def _server(self, args_pytest, request): # Start server pargs = [ 'python','main.py', @@ -167,10 +167,12 @@ class TestExecution: '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml', '--cpu', ] - pargs += [ str(param) for param in request.param["extra_args"] ] + use_lru, lru_size = request.param + if use_lru: + pargs += ['--cache-lru', str(lru_size)] print("Running server with args:", pargs) # noqa: T201 p = subprocess.Popen(pargs) - yield request.param + yield p.kill() torch.cuda.empty_cache() @@ -191,7 +193,7 @@ class TestExecution: return comfy_client @fixture(scope="class", autouse=True) - def shared_client(self, args_pytest, server): + def shared_client(self, args_pytest, _server): client = self.start_client(args_pytest["listen"], args_pytest["port"]) yield client del client @@ -223,7 +225,7 @@ class TestExecution: assert result.did_run(mask) assert result.did_run(lazy_mix) - def test_full_cache(self, client: ComfyClient, builder: GraphBuilder, server): + def test_full_cache(self, client: ComfyClient, builder: GraphBuilder): g = builder input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) input2 = g.node("StubImage", content="NOISE", height=512, width=512, batch_size=1) @@ -235,12 +237,9 @@ class TestExecution: client.run(g) result2 = client.run(g) for node_id, node in g.nodes.items(): - if server["should_cache_results"]: - assert not result2.did_run(node), f"Node {node_id} ran, but should have been cached" - else: - assert result2.did_run(node), f"Node {node_id} was cached, but should have been run" + assert not result2.did_run(node), f"Node {node_id} ran, but should have been cached" - def test_partial_cache(self, client: ComfyClient, builder: GraphBuilder, server): + def test_partial_cache(self, client: ComfyClient, builder: GraphBuilder): g = builder input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) input2 = g.node("StubImage", content="NOISE", height=512, width=512, batch_size=1) @@ -252,12 +251,8 @@ class TestExecution: client.run(g) mask.inputs['value'] = 0.4 result2 = client.run(g) - if server["should_cache_results"]: - assert not result2.did_run(input1), "Input1 should have been cached" - assert not result2.did_run(input2), "Input2 should have been cached" - else: - assert result2.did_run(input1), "Input1 should have been rerun" - assert result2.did_run(input2), "Input2 should have been rerun" + assert not result2.did_run(input1), "Input1 should have been cached" + assert not result2.did_run(input2), "Input2 should have been cached" def test_error(self, client: ComfyClient, builder: GraphBuilder): g = builder @@ -416,7 +411,7 @@ class TestExecution: input2 = g.node("StubImage", id="removeme", content="WHITE", height=512, width=512, batch_size=1) client.run(g) - def test_custom_is_changed(self, client: ComfyClient, builder: GraphBuilder, server): + def test_custom_is_changed(self, client: ComfyClient, builder: GraphBuilder): g = builder # Creating the nodes in this specific order previously caused a bug save = g.node("SaveImage") @@ -432,10 +427,7 @@ class TestExecution: result3 = client.run(g) result4 = client.run(g) assert result1.did_run(is_changed), "is_changed should have been run" - if server["should_cache_results"]: - assert not result2.did_run(is_changed), "is_changed should have been cached" - else: - assert result2.did_run(is_changed), "is_changed should have been re-run" + assert not result2.did_run(is_changed), "is_changed should have been cached" assert result3.did_run(is_changed), "is_changed should have been re-run" assert result4.did_run(is_changed), "is_changed should not have been cached" @@ -522,7 +514,7 @@ class TestExecution: assert len(images2) == 1, "Should have 1 image" # This tests that only constant outputs are used in the call to `IS_CHANGED` - def test_is_changed_with_outputs(self, client: ComfyClient, builder: GraphBuilder, server): + def test_is_changed_with_outputs(self, client: ComfyClient, builder: GraphBuilder): g = builder input1 = g.node("StubConstantImage", value=0.5, height=512, width=512, batch_size=1) test_node = g.node("TestIsChangedWithConstants", image=input1.out(0), value=0.5) @@ -538,11 +530,7 @@ class TestExecution: images = result.get_images(output) assert len(images) == 1, "Should have 1 image" assert numpy.array(images[0]).min() == 63 and numpy.array(images[0]).max() == 63, "Image should have value 0.25" - if server["should_cache_results"]: - assert not result.did_run(test_node), "The execution should have been cached" - else: - assert result.did_run(test_node), "The execution should have been re-run" - + assert not result.did_run(test_node), "The execution should have been cached" def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): # Warmup execution to ensure server is fully initialized From 560b1bdfca77d9441ca2924fd9d6baa8dda05cd7 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 20 Oct 2025 15:44:38 -0400 Subject: [PATCH 477/759] ComfyUI version v0.3.66 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index d39c1fdc4..33a06bbb0 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.65" +__version__ = "0.3.66" diff --git a/pyproject.toml b/pyproject.toml index 653604e24..0c6b23a25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.65" +version = "0.3.66" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 9cdc64998f8990aed7688b0ebe89bc3b97733764 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 21 Oct 2025 16:15:23 -0700 Subject: [PATCH 478/759] Only disable cudnn on newer AMD GPUs. (#10437) --- comfy/model_management.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index a2c318ec3..79d6ff9d4 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -330,15 +330,21 @@ except: SUPPORT_FP8_OPS = args.supports_fp8_compute + +AMD_RDNA2_AND_OLDER_ARCH = ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"] + try: if is_amd(): - torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD - logging.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.") + arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName + if not (any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH)): + torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD + logging.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.") + try: rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2])) except: rocm_version = (6, -1) - arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName + logging.info("AMD arch: {}".format(arch)) logging.info("ROCm version: {}".format(rocm_version)) if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: @@ -1331,7 +1337,7 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma if is_amd(): arch = torch.cuda.get_device_properties(device).gcnArchName - if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]): # RDNA2 and older don't support bf16 + if any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH): # RDNA2 and older don't support bf16 if manual_cast: return True return False From f13cff0be65e35d34876b173bba2fec6bd94746b Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Tue, 21 Oct 2025 20:16:16 -0700 Subject: [PATCH 479/759] Add custom node published subgraphs endpoint (#10438) * Add get_subgraphs_dir to ComfyExtension and PUBLISHED_SUBGRAPH_DIRS to nodes.py * Created initial endpoints, although the returned paths are a bit off currently * Fix path and actually return real data * Sanitize returned /api/global_subgraphs entries * Remove leftover function from early prototyping * Remove added whitespace * Add None check for sanitize_entry --- app/subgraph_manager.py | 112 ++++++++++++++++++++++++++++++++++++++++ server.py | 3 ++ 2 files changed, 115 insertions(+) create mode 100644 app/subgraph_manager.py diff --git a/app/subgraph_manager.py b/app/subgraph_manager.py new file mode 100644 index 000000000..dbe404541 --- /dev/null +++ b/app/subgraph_manager.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +from typing import TypedDict +import os +import folder_paths +import glob +from aiohttp import web +import hashlib + + +class Source: + custom_node = "custom_node" + +class SubgraphEntry(TypedDict): + source: str + """ + Source of subgraph - custom_nodes vs templates. + """ + path: str + """ + Relative path of the subgraph file. + For custom nodes, will be the relative directory like /subgraphs/.json + """ + name: str + """ + Name of subgraph file. + """ + info: CustomNodeSubgraphEntryInfo + """ + Additional info about subgraph; in the case of custom_nodes, will contain nodepack name + """ + data: str + +class CustomNodeSubgraphEntryInfo(TypedDict): + node_pack: str + """Node pack name.""" + +class SubgraphManager: + def __init__(self): + self.cached_custom_node_subgraphs: dict[SubgraphEntry] | None = None + + async def load_entry_data(self, entry: SubgraphEntry): + with open(entry['path'], 'r') as f: + entry['data'] = f.read() + return entry + + async def sanitize_entry(self, entry: SubgraphEntry | None, remove_data=False) -> SubgraphEntry | None: + if entry is None: + return None + entry = entry.copy() + entry.pop('path', None) + if remove_data: + entry.pop('data', None) + return entry + + async def sanitize_entries(self, entries: dict[str, SubgraphEntry], remove_data=False) -> dict[str, SubgraphEntry]: + entries = entries.copy() + for key in list(entries.keys()): + entries[key] = await self.sanitize_entry(entries[key], remove_data) + return entries + + async def get_custom_node_subgraphs(self, loadedModules, force_reload=False): + # if not forced to reload and cached, return cache + if not force_reload and self.cached_custom_node_subgraphs is not None: + return self.cached_custom_node_subgraphs + # Load subgraphs from custom nodes + subfolder = "subgraphs" + subgraphs_dict: dict[SubgraphEntry] = {} + + for folder in folder_paths.get_folder_paths("custom_nodes"): + pattern = os.path.join(folder, f"*/{subfolder}/*.json") + matched_files = glob.glob(pattern) + for file in matched_files: + # replace backslashes with forward slashes + file = file.replace('\\', '/') + info: CustomNodeSubgraphEntryInfo = { + "node_pack": "custom_nodes." + file.split('/')[-3] + } + source = Source.custom_node + # hash source + path to make sure id will be as unique as possible, but + # reproducible across backend reloads + id = hashlib.sha256(f"{source}{file}".encode()).hexdigest() + entry: SubgraphEntry = { + "source": Source.custom_node, + "name": os.path.splitext(os.path.basename(file))[0], + "path": file, + "info": info, + } + subgraphs_dict[id] = entry + self.cached_custom_node_subgraphs = subgraphs_dict + return subgraphs_dict + + async def get_custom_node_subgraph(self, id: str, loadedModules): + subgraphs = await self.get_custom_node_subgraphs(loadedModules) + entry: SubgraphEntry = subgraphs.get(id, None) + if entry is not None and entry.get('data', None) is None: + await self.load_entry_data(entry) + return entry + + def add_routes(self, routes, loadedModules): + @routes.get("/global_subgraphs") + async def get_global_subgraphs(request): + subgraphs_dict = await self.get_custom_node_subgraphs(loadedModules) + # NOTE: we may want to include other sources of global subgraphs such as templates in the future; + # that's the reasoning for the current implementation + return web.json_response(await self.sanitize_entries(subgraphs_dict, remove_data=True)) + + @routes.get("/global_subgraphs/{id}") + async def get_global_subgraph(request): + id = request.match_info.get("id", None) + subgraph = await self.get_custom_node_subgraph(id, loadedModules) + return web.json_response(await self.sanitize_entry(subgraph)) diff --git a/server.py b/server.py index 10c2698b5..fe58db286 100644 --- a/server.py +++ b/server.py @@ -35,6 +35,7 @@ from comfy_api.internal import _ComfyNodeInternal from app.user_manager import UserManager from app.model_manager import ModelFileManager from app.custom_node_manager import CustomNodeManager +from app.subgraph_manager import SubgraphManager from typing import Optional, Union from api_server.routes.internal.internal_routes import InternalRoutes from protocol import BinaryEventTypes @@ -173,6 +174,7 @@ class PromptServer(): self.user_manager = UserManager() self.model_file_manager = ModelFileManager() self.custom_node_manager = CustomNodeManager() + self.subgraph_manager = SubgraphManager() self.internal_routes = InternalRoutes(self) self.supports = ["custom_nodes_from_web"] self.prompt_queue = execution.PromptQueue(self) @@ -819,6 +821,7 @@ class PromptServer(): self.user_manager.add_routes(self.routes) self.model_file_manager.add_routes(self.routes) self.custom_node_manager.add_routes(self.routes, self.app, nodes.LOADED_MODULE_DIRS.items()) + self.subgraph_manager.add_routes(self.routes, nodes.LOADED_MODULE_DIRS.items()) self.app.add_subapp('/internal', self.internal_routes.get_app()) # Prefix every route with /api for easier matching for delegation. From 4739d7717fea56750d0ef98c64268d9c1e487d78 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Thu, 23 Oct 2025 05:49:05 +1000 Subject: [PATCH 480/759] execution: fold in dependency aware caching / Fix --cache-none with loops/lazy etc (Resubmit) (#10440) * execution: fold in dependency aware caching This makes --cache-none compatiable with lazy and expanded subgraphs. Currently the --cache-none option is powered by the DependencyAwareCache. The cache attempts to maintain a parallel copy of the execution list data structure, however it is only setup once at the start of execution and does not get meaninigful updates to the execution list. This causes multiple problems when --cache-none is used with lazy and expanded subgraphs as the DAC does not accurately update its copy of the execution data structure. DAC has an attempt to handle subgraphs ensure_subcache however this does not accurately connect to nodes outside the subgraph. The current semantics of DAC are to free a node ASAP after the dependent nodes are executed. This means that if a subgraph refs such a node it will be requed and re-executed by the execution_list but DAC wont see it in its to-free lists anymore and leak memory. Rather than try and cover all the cases where the execution list changes from inside the cache, move the while problem to the executor which maintains an always up-to-date copy of the wanted data-structure. The executor now has a fast-moving run-local cache of its own. Each _to node has its own mini cache, and the cache is unconditionally primed at the time of add_strong_link. add_strong_link is called for all of static workflows, lazy links and expanded subgraphs so its the singular source of truth for output dependendencies. In the case of a cache-hit, the executor cache will hold the non-none value (it will respect updates if they happen somehow as well). In the case of a cache-miss, the executor caches a None and will wait for a notification to update the value when the node completes. When a node completes execution, it simply releases its mini-cache and in turn its strong refs on its direct anscestor outputs, allowing for ASAP freeing (same as the DependencyAwareCache but a little more automatic). This now allows for re-implementation of --cache-none with no cache at all. The dependency aware cache was also observing the dependency sematics for the objects and UI cache which is not accurate (this entire logic was always outputs specific). This also prepares for more complex caching strategies (such as RAM pressure based caching), where a cache can implement any freeing strategy completely independently of the DepedancyAwareness requirement. * main: re-implement --cache-none as no cache at all The execution list now tracks the dependency aware caching more correctly that the DependancyAwareCache. Change it to a cache that does nothing. * test_execution: add --cache-none to the test suite --cache-none is now expected to work universally. Run it through the full unit test suite. Propagate the server parameterization for whether or not the server is capabale of caching, so that the minority of tests that specifically check for cache hits can if else. Hard assert NOT caching in the else to give some coverage of --cache-none expected behaviour to not acutally cache. --- comfy_execution/caching.py | 174 ++++-------------------------- comfy_execution/graph.py | 32 +++++- execution.py | 34 +++--- main.py | 2 +- tests/execution/test_execution.py | 50 +++++---- 5 files changed, 102 insertions(+), 190 deletions(-) diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py index 41224ce3b..566bc3f9c 100644 --- a/comfy_execution/caching.py +++ b/comfy_execution/caching.py @@ -265,6 +265,26 @@ class HierarchicalCache(BasicCache): assert cache is not None return await cache._ensure_subcache(node_id, children_ids) +class NullCache: + + async def set_prompt(self, dynprompt, node_ids, is_changed_cache): + pass + + def all_node_ids(self): + return [] + + def clean_unused(self): + pass + + def get(self, node_id): + return None + + def set(self, node_id, value): + pass + + async def ensure_subcache_for(self, node_id, children_ids): + return self + class LRUCache(BasicCache): def __init__(self, key_class, max_size=100): super().__init__(key_class) @@ -316,157 +336,3 @@ class LRUCache(BasicCache): self._mark_used(child_id) self.children[cache_key].append(self.cache_key_set.get_data_key(child_id)) return self - - -class DependencyAwareCache(BasicCache): - """ - A cache implementation that tracks dependencies between nodes and manages - their execution and caching accordingly. It extends the BasicCache class. - Nodes are removed from this cache once all of their descendants have been - executed. - """ - - def __init__(self, key_class): - """ - Initialize the DependencyAwareCache. - - Args: - key_class: The class used for generating cache keys. - """ - super().__init__(key_class) - self.descendants = {} # Maps node_id -> set of descendant node_ids - self.ancestors = {} # Maps node_id -> set of ancestor node_ids - self.executed_nodes = set() # Tracks nodes that have been executed - - async def set_prompt(self, dynprompt, node_ids, is_changed_cache): - """ - Clear the entire cache and rebuild the dependency graph. - - Args: - dynprompt: The dynamic prompt object containing node information. - node_ids: List of node IDs to initialize the cache for. - is_changed_cache: Flag indicating if the cache has changed. - """ - # Clear all existing cache data - self.cache.clear() - self.subcaches.clear() - self.descendants.clear() - self.ancestors.clear() - self.executed_nodes.clear() - - # Call the parent method to initialize the cache with the new prompt - await super().set_prompt(dynprompt, node_ids, is_changed_cache) - - # Rebuild the dependency graph - self._build_dependency_graph(dynprompt, node_ids) - - def _build_dependency_graph(self, dynprompt, node_ids): - """ - Build the dependency graph for all nodes. - - Args: - dynprompt: The dynamic prompt object containing node information. - node_ids: List of node IDs to build the graph for. - """ - self.descendants.clear() - self.ancestors.clear() - for node_id in node_ids: - self.descendants[node_id] = set() - self.ancestors[node_id] = set() - - for node_id in node_ids: - inputs = dynprompt.get_node(node_id)["inputs"] - for input_data in inputs.values(): - if is_link(input_data): # Check if the input is a link to another node - ancestor_id = input_data[0] - self.descendants[ancestor_id].add(node_id) - self.ancestors[node_id].add(ancestor_id) - - def set(self, node_id, value): - """ - Mark a node as executed and store its value in the cache. - - Args: - node_id: The ID of the node to store. - value: The value to store for the node. - """ - self._set_immediate(node_id, value) - self.executed_nodes.add(node_id) - self._cleanup_ancestors(node_id) - - def get(self, node_id): - """ - Retrieve the cached value for a node. - - Args: - node_id: The ID of the node to retrieve. - - Returns: - The cached value for the node. - """ - return self._get_immediate(node_id) - - async def ensure_subcache_for(self, node_id, children_ids): - """ - Ensure a subcache exists for a node and update dependencies. - - Args: - node_id: The ID of the parent node. - children_ids: List of child node IDs to associate with the parent node. - - Returns: - The subcache object for the node. - """ - subcache = await super()._ensure_subcache(node_id, children_ids) - for child_id in children_ids: - self.descendants[node_id].add(child_id) - self.ancestors[child_id].add(node_id) - return subcache - - def _cleanup_ancestors(self, node_id): - """ - Check if ancestors of a node can be removed from the cache. - - Args: - node_id: The ID of the node whose ancestors are to be checked. - """ - for ancestor_id in self.ancestors.get(node_id, []): - if ancestor_id in self.executed_nodes: - # Remove ancestor if all its descendants have been executed - if all(descendant in self.executed_nodes for descendant in self.descendants[ancestor_id]): - self._remove_node(ancestor_id) - - def _remove_node(self, node_id): - """ - Remove a node from the cache. - - Args: - node_id: The ID of the node to remove. - """ - cache_key = self.cache_key_set.get_data_key(node_id) - if cache_key in self.cache: - del self.cache[cache_key] - subcache_key = self.cache_key_set.get_subcache_key(node_id) - if subcache_key in self.subcaches: - del self.subcaches[subcache_key] - - def clean_unused(self): - """ - Clean up unused nodes. This is a no-op for this cache implementation. - """ - pass - - def recursive_debug_dump(self): - """ - Dump the cache and dependency graph for debugging. - - Returns: - A list containing the cache state and dependency graph. - """ - result = super().recursive_debug_dump() - result.append({ - "descendants": self.descendants, - "ancestors": self.ancestors, - "executed_nodes": list(self.executed_nodes), - }) - return result diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index f4b427265..341c9735d 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -153,8 +153,9 @@ class TopologicalSort: continue _, _, input_info = self.get_input_info(unique_id, input_name) is_lazy = input_info is not None and "lazy" in input_info and input_info["lazy"] - if (include_lazy or not is_lazy) and not self.is_cached(from_node_id): - node_ids.append(from_node_id) + if (include_lazy or not is_lazy): + if not self.is_cached(from_node_id): + node_ids.append(from_node_id) links.append((from_node_id, from_socket, unique_id)) for link in links: @@ -194,10 +195,35 @@ class ExecutionList(TopologicalSort): super().__init__(dynprompt) self.output_cache = output_cache self.staged_node_id = None + self.execution_cache = {} + self.execution_cache_listeners = {} def is_cached(self, node_id): return self.output_cache.get(node_id) is not None + def cache_link(self, from_node_id, to_node_id): + if not to_node_id in self.execution_cache: + self.execution_cache[to_node_id] = {} + self.execution_cache[to_node_id][from_node_id] = self.output_cache.get(from_node_id) + if not from_node_id in self.execution_cache_listeners: + self.execution_cache_listeners[from_node_id] = set() + self.execution_cache_listeners[from_node_id].add(to_node_id) + + def get_output_cache(self, from_node_id, to_node_id): + if not to_node_id in self.execution_cache: + return None + return self.execution_cache[to_node_id].get(from_node_id) + + def cache_update(self, node_id, value): + if node_id in self.execution_cache_listeners: + for to_node_id in self.execution_cache_listeners[node_id]: + if to_node_id in self.execution_cache: + self.execution_cache[to_node_id][node_id] = value + + def add_strong_link(self, from_node_id, from_socket, to_node_id): + super().add_strong_link(from_node_id, from_socket, to_node_id) + self.cache_link(from_node_id, to_node_id) + async def stage_node_execution(self): assert self.staged_node_id is None if self.is_empty(): @@ -277,6 +303,8 @@ class ExecutionList(TopologicalSort): def complete_node_execution(self): node_id = self.staged_node_id self.pop_node(node_id) + self.execution_cache.pop(node_id, None) + self.execution_cache_listeners.pop(node_id, None) self.staged_node_id = None def get_nodes_in_cycle(self): diff --git a/execution.py b/execution.py index 1dc35738b..78c36a4b0 100644 --- a/execution.py +++ b/execution.py @@ -18,7 +18,7 @@ from comfy_execution.caching import ( BasicCache, CacheKeySetID, CacheKeySetInputSignature, - DependencyAwareCache, + NullCache, HierarchicalCache, LRUCache, ) @@ -91,13 +91,13 @@ class IsChangedCache: class CacheType(Enum): CLASSIC = 0 LRU = 1 - DEPENDENCY_AWARE = 2 + NONE = 2 class CacheSet: def __init__(self, cache_type=None, cache_size=None): - if cache_type == CacheType.DEPENDENCY_AWARE: - self.init_dependency_aware_cache() + if cache_type == CacheType.NONE: + self.init_null_cache() logging.info("Disabling intermediate node cache.") elif cache_type == CacheType.LRU: if cache_size is None: @@ -120,11 +120,12 @@ class CacheSet: self.ui = LRUCache(CacheKeySetInputSignature, max_size=cache_size) self.objects = HierarchicalCache(CacheKeySetID) - # only hold cached items while the decendents have not executed - def init_dependency_aware_cache(self): - self.outputs = DependencyAwareCache(CacheKeySetInputSignature) - self.ui = DependencyAwareCache(CacheKeySetInputSignature) - self.objects = DependencyAwareCache(CacheKeySetID) + def init_null_cache(self): + self.outputs = NullCache() + #The UI cache is expected to be iterable at the end of each workflow + #so it must cache at least a full workflow. Use Heirachical + self.ui = HierarchicalCache(CacheKeySetInputSignature) + self.objects = NullCache() def recursive_debug_dump(self): result = { @@ -135,7 +136,7 @@ class CacheSet: SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org") -def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, extra_data={}): +def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=None, extra_data={}): is_v3 = issubclass(class_def, _ComfyNodeInternal) if is_v3: valid_inputs, schema = class_def.INPUT_TYPES(include_hidden=False, return_schema=True) @@ -153,10 +154,10 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e if is_link(input_data) and (not input_info or not input_info.get("rawLink", False)): input_unique_id = input_data[0] output_index = input_data[1] - if outputs is None: + if execution_list is None: mark_missing() continue # This might be a lazily-evaluated input - cached_output = outputs.get(input_unique_id) + cached_output = execution_list.get_output_cache(input_unique_id, unique_id) if cached_output is None: mark_missing() continue @@ -405,6 +406,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, cached_output = caches.ui.get(unique_id) or {} server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_output.get("output",None), "prompt_id": prompt_id }, server.client_id) get_progress_state().finish_progress(unique_id) + execution_list.cache_update(unique_id, caches.outputs.get(unique_id)) return (ExecutionResult.SUCCESS, None, None) input_data_all = None @@ -434,7 +436,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, for r in result: if is_link(r): source_node, source_output = r[0], r[1] - node_output = caches.outputs.get(source_node)[source_output] + node_output = execution_list.get_output_cache(source_node, unique_id)[source_output] for o in node_output: resolved_output.append(o) @@ -446,7 +448,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, has_subgraph = False else: get_progress_state().start_progress(unique_id) - input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data) + input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data) if server.client_id is not None: server.last_node_id = display_node_id server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id) @@ -549,11 +551,15 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, subcache.clean_unused() for node_id in new_output_ids: execution_list.add_node(node_id) + execution_list.cache_link(node_id, unique_id) for link in new_output_links: execution_list.add_strong_link(link[0], link[1], unique_id) pending_subgraph_results[unique_id] = cached_outputs return (ExecutionResult.PENDING, None, None) + caches.outputs.set(unique_id, output_data) + execution_list.cache_update(unique_id, output_data) + except comfy.model_management.InterruptProcessingException as iex: logging.info("Processing interrupted") diff --git a/main.py b/main.py index 35857dba8..4b4c5dcc4 100644 --- a/main.py +++ b/main.py @@ -173,7 +173,7 @@ def prompt_worker(q, server_instance): if args.cache_lru > 0: cache_type = execution.CacheType.LRU elif args.cache_none: - cache_type = execution.CacheType.DEPENDENCY_AWARE + cache_type = execution.CacheType.NONE e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_size=args.cache_lru) last_gc_collect = 0 diff --git a/tests/execution/test_execution.py b/tests/execution/test_execution.py index ef73ad9fd..ace0d2279 100644 --- a/tests/execution/test_execution.py +++ b/tests/execution/test_execution.py @@ -152,12 +152,12 @@ class TestExecution: # Initialize server and client # @fixture(scope="class", autouse=True, params=[ - # (use_lru, lru_size) - (False, 0), - (True, 0), - (True, 100), + { "extra_args" : [], "should_cache_results" : True }, + { "extra_args" : ["--cache-lru", 0], "should_cache_results" : True }, + { "extra_args" : ["--cache-lru", 100], "should_cache_results" : True }, + { "extra_args" : ["--cache-none"], "should_cache_results" : False }, ]) - def _server(self, args_pytest, request): + def server(self, args_pytest, request): # Start server pargs = [ 'python','main.py', @@ -167,12 +167,10 @@ class TestExecution: '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml', '--cpu', ] - use_lru, lru_size = request.param - if use_lru: - pargs += ['--cache-lru', str(lru_size)] + pargs += [ str(param) for param in request.param["extra_args"] ] print("Running server with args:", pargs) # noqa: T201 p = subprocess.Popen(pargs) - yield + yield request.param p.kill() torch.cuda.empty_cache() @@ -193,7 +191,7 @@ class TestExecution: return comfy_client @fixture(scope="class", autouse=True) - def shared_client(self, args_pytest, _server): + def shared_client(self, args_pytest, server): client = self.start_client(args_pytest["listen"], args_pytest["port"]) yield client del client @@ -225,7 +223,7 @@ class TestExecution: assert result.did_run(mask) assert result.did_run(lazy_mix) - def test_full_cache(self, client: ComfyClient, builder: GraphBuilder): + def test_full_cache(self, client: ComfyClient, builder: GraphBuilder, server): g = builder input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) input2 = g.node("StubImage", content="NOISE", height=512, width=512, batch_size=1) @@ -237,9 +235,12 @@ class TestExecution: client.run(g) result2 = client.run(g) for node_id, node in g.nodes.items(): - assert not result2.did_run(node), f"Node {node_id} ran, but should have been cached" + if server["should_cache_results"]: + assert not result2.did_run(node), f"Node {node_id} ran, but should have been cached" + else: + assert result2.did_run(node), f"Node {node_id} was cached, but should have been run" - def test_partial_cache(self, client: ComfyClient, builder: GraphBuilder): + def test_partial_cache(self, client: ComfyClient, builder: GraphBuilder, server): g = builder input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) input2 = g.node("StubImage", content="NOISE", height=512, width=512, batch_size=1) @@ -251,8 +252,12 @@ class TestExecution: client.run(g) mask.inputs['value'] = 0.4 result2 = client.run(g) - assert not result2.did_run(input1), "Input1 should have been cached" - assert not result2.did_run(input2), "Input2 should have been cached" + if server["should_cache_results"]: + assert not result2.did_run(input1), "Input1 should have been cached" + assert not result2.did_run(input2), "Input2 should have been cached" + else: + assert result2.did_run(input1), "Input1 should have been rerun" + assert result2.did_run(input2), "Input2 should have been rerun" def test_error(self, client: ComfyClient, builder: GraphBuilder): g = builder @@ -411,7 +416,7 @@ class TestExecution: input2 = g.node("StubImage", id="removeme", content="WHITE", height=512, width=512, batch_size=1) client.run(g) - def test_custom_is_changed(self, client: ComfyClient, builder: GraphBuilder): + def test_custom_is_changed(self, client: ComfyClient, builder: GraphBuilder, server): g = builder # Creating the nodes in this specific order previously caused a bug save = g.node("SaveImage") @@ -427,7 +432,10 @@ class TestExecution: result3 = client.run(g) result4 = client.run(g) assert result1.did_run(is_changed), "is_changed should have been run" - assert not result2.did_run(is_changed), "is_changed should have been cached" + if server["should_cache_results"]: + assert not result2.did_run(is_changed), "is_changed should have been cached" + else: + assert result2.did_run(is_changed), "is_changed should have been re-run" assert result3.did_run(is_changed), "is_changed should have been re-run" assert result4.did_run(is_changed), "is_changed should not have been cached" @@ -514,7 +522,7 @@ class TestExecution: assert len(images2) == 1, "Should have 1 image" # This tests that only constant outputs are used in the call to `IS_CHANGED` - def test_is_changed_with_outputs(self, client: ComfyClient, builder: GraphBuilder): + def test_is_changed_with_outputs(self, client: ComfyClient, builder: GraphBuilder, server): g = builder input1 = g.node("StubConstantImage", value=0.5, height=512, width=512, batch_size=1) test_node = g.node("TestIsChangedWithConstants", image=input1.out(0), value=0.5) @@ -530,7 +538,11 @@ class TestExecution: images = result.get_images(output) assert len(images) == 1, "Should have 1 image" assert numpy.array(images[0]).min() == 63 and numpy.array(images[0]).max() == 63, "Image should have value 0.25" - assert not result.did_run(test_node), "The execution should have been cached" + if server["should_cache_results"]: + assert not result.did_run(test_node), "The execution should have been cached" + else: + assert result.did_run(test_node), "The execution should have been re-run" + def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): # Warmup execution to ensure server is fully initialized From a1864c01f29cc43fe6bf823fc3fd46ba2781c2e0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 22 Oct 2025 14:26:22 -0700 Subject: [PATCH 481/759] Small readme improvement. (#10442) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c9a0644e3..434d4ff06 100644 --- a/README.md +++ b/README.md @@ -201,6 +201,8 @@ Python 3.14 will work if you comment out the `kornia` dependency in the requirem Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12 +### Instructions: + Git clone this repo. Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints From 1bcda6df987a6c92b39d8b6d29e0b029450d67d0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 23 Oct 2025 18:21:14 -0700 Subject: [PATCH 482/759] WIP way to support multi multi dimensional latents. (#10456) --- comfy/model_base.py | 10 ++++- comfy/nested_tensor.py | 91 ++++++++++++++++++++++++++++++++++++++++++ comfy/sample.py | 27 ++++++++++--- comfy/samplers.py | 23 +++++++---- comfy/utils.py | 22 ++++++++++ 5 files changed, 158 insertions(+), 15 deletions(-) create mode 100644 comfy/nested_tensor.py diff --git a/comfy/model_base.py b/comfy/model_base.py index 8274c7dea..e877f19ac 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -197,8 +197,14 @@ class BaseModel(torch.nn.Module): extra_conds[o] = extra t = self.process_timestep(t, x=x, **extra_conds) - model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float() - return self.model_sampling.calculate_denoised(sigma, model_output, x) + if "latent_shapes" in extra_conds: + xc = utils.unpack_latents(xc, extra_conds.pop("latent_shapes")) + + model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds) + if len(model_output) > 1 and not torch.is_tensor(model_output): + model_output, _ = utils.pack_latents(model_output) + + return self.model_sampling.calculate_denoised(sigma, model_output.float(), x) def process_timestep(self, timestep, **kwargs): return timestep diff --git a/comfy/nested_tensor.py b/comfy/nested_tensor.py new file mode 100644 index 000000000..b700816fa --- /dev/null +++ b/comfy/nested_tensor.py @@ -0,0 +1,91 @@ +import torch + +class NestedTensor: + def __init__(self, tensors): + self.tensors = list(tensors) + self.is_nested = True + + def _copy(self): + return NestedTensor(self.tensors) + + def apply_operation(self, other, operation): + o = self._copy() + if isinstance(other, NestedTensor): + for i, t in enumerate(o.tensors): + o.tensors[i] = operation(t, other.tensors[i]) + else: + for i, t in enumerate(o.tensors): + o.tensors[i] = operation(t, other) + return o + + def __add__(self, b): + return self.apply_operation(b, lambda x, y: x + y) + + def __sub__(self, b): + return self.apply_operation(b, lambda x, y: x - y) + + def __mul__(self, b): + return self.apply_operation(b, lambda x, y: x * y) + + # def __itruediv__(self, b): + # return self.apply_operation(b, lambda x, y: x / y) + + def __truediv__(self, b): + return self.apply_operation(b, lambda x, y: x / y) + + def __getitem__(self, *args, **kwargs): + return self.apply_operation(None, lambda x, y: x.__getitem__(*args, **kwargs)) + + def unbind(self): + return self.tensors + + def to(self, *args, **kwargs): + o = self._copy() + for i, t in enumerate(o.tensors): + o.tensors[i] = t.to(*args, **kwargs) + return o + + def new_ones(self, *args, **kwargs): + return self.tensors[0].new_ones(*args, **kwargs) + + def float(self): + return self.to(dtype=torch.float) + + def chunk(self, *args, **kwargs): + return self.apply_operation(None, lambda x, y: x.chunk(*args, **kwargs)) + + def size(self): + return self.tensors[0].size() + + @property + def shape(self): + return self.tensors[0].shape + + @property + def ndim(self): + dims = 0 + for t in self.tensors: + dims = max(t.ndim, dims) + return dims + + @property + def device(self): + return self.tensors[0].device + + @property + def dtype(self): + return self.tensors[0].dtype + + @property + def layout(self): + return self.tensors[0].layout + + +def cat_nested(tensors, *args, **kwargs): + cated_tensors = [] + for i in range(len(tensors[0].tensors)): + tens = [] + for j in range(len(tensors)): + tens.append(tensors[j].tensors[i]) + cated_tensors.append(torch.cat(tens, *args, **kwargs)) + return NestedTensor(cated_tensors) diff --git a/comfy/sample.py b/comfy/sample.py index be5a7e246..b1395da84 100644 --- a/comfy/sample.py +++ b/comfy/sample.py @@ -4,13 +4,9 @@ import comfy.samplers import comfy.utils import numpy as np import logging +import comfy.nested_tensor -def prepare_noise(latent_image, seed, noise_inds=None): - """ - creates random noise given a latent image and a seed. - optional arg skip can be used to skip and discard x number of noise generations for a given seed - """ - generator = torch.manual_seed(seed) +def prepare_noise_inner(latent_image, generator, noise_inds=None): if noise_inds is None: return torch.randn(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu") @@ -22,9 +18,28 @@ def prepare_noise(latent_image, seed, noise_inds=None): noises.append(noise) noises = [noises[i] for i in inverse] noises = torch.cat(noises, axis=0) + +def prepare_noise(latent_image, seed, noise_inds=None): + """ + creates random noise given a latent image and a seed. + optional arg skip can be used to skip and discard x number of noise generations for a given seed + """ + generator = torch.manual_seed(seed) + + if latent_image.is_nested: + tensors = latent_image.unbind() + noises = [] + for t in tensors: + noises.append(prepare_noise_inner(t, generator, noise_inds)) + noises = comfy.nested_tensor.NestedTensor(noises) + else: + noises = prepare_noise_inner(latent_image, generator, noise_inds) + return noises def fix_empty_latent_channels(model, latent_image): + if latent_image.is_nested: + return latent_image latent_format = model.get_model_object("latent_format") #Resize the empty latent image so it has the right number of channels if latent_format.latent_channels != latent_image.shape[1] and torch.count_nonzero(latent_image) == 0: latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1) diff --git a/comfy/samplers.py b/comfy/samplers.py index e7efaf470..fa4640842 100755 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -782,7 +782,7 @@ def ksampler(sampler_name, extra_options={}, inpaint_options={}): return KSAMPLER(sampler_function, extra_options, inpaint_options) -def process_conds(model, noise, conds, device, latent_image=None, denoise_mask=None, seed=None): +def process_conds(model, noise, conds, device, latent_image=None, denoise_mask=None, seed=None, latent_shapes=None): for k in conds: conds[k] = conds[k][:] resolve_areas_and_cond_masks_multidim(conds[k], noise.shape[2:], device) @@ -792,7 +792,7 @@ def process_conds(model, noise, conds, device, latent_image=None, denoise_mask=N if hasattr(model, 'extra_conds'): for k in conds: - conds[k] = encode_model_conds(model.extra_conds, conds[k], noise, device, k, latent_image=latent_image, denoise_mask=denoise_mask, seed=seed) + conds[k] = encode_model_conds(model.extra_conds, conds[k], noise, device, k, latent_image=latent_image, denoise_mask=denoise_mask, seed=seed, latent_shapes=latent_shapes) #make sure each cond area has an opposite one with the same area for k in conds: @@ -962,11 +962,11 @@ class CFGGuider: def predict_noise(self, x, timestep, model_options={}, seed=None): return sampling_function(self.inner_model, x, timestep, self.conds.get("negative", None), self.conds.get("positive", None), self.cfg, model_options=model_options, seed=seed) - def inner_sample(self, noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed): + def inner_sample(self, noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=None): if latent_image is not None and torch.count_nonzero(latent_image) > 0: #Don't shift the empty latent image. latent_image = self.inner_model.process_latent_in(latent_image) - self.conds = process_conds(self.inner_model, noise, self.conds, device, latent_image, denoise_mask, seed) + self.conds = process_conds(self.inner_model, noise, self.conds, device, latent_image, denoise_mask, seed, latent_shapes=latent_shapes) extra_model_options = comfy.model_patcher.create_model_options_clone(self.model_options) extra_model_options.setdefault("transformer_options", {})["sample_sigmas"] = sigmas @@ -980,7 +980,7 @@ class CFGGuider: samples = executor.execute(self, sigmas, extra_args, callback, noise, latent_image, denoise_mask, disable_pbar) return self.inner_model.process_latent_out(samples.to(torch.float32)) - def outer_sample(self, noise, latent_image, sampler, sigmas, denoise_mask=None, callback=None, disable_pbar=False, seed=None): + def outer_sample(self, noise, latent_image, sampler, sigmas, denoise_mask=None, callback=None, disable_pbar=False, seed=None, latent_shapes=None): self.inner_model, self.conds, self.loaded_models = comfy.sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options) device = self.model_patcher.load_device @@ -994,7 +994,7 @@ class CFGGuider: try: self.model_patcher.pre_run() - output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed) + output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes) finally: self.model_patcher.cleanup() @@ -1007,6 +1007,12 @@ class CFGGuider: if sigmas.shape[-1] == 0: return latent_image + if latent_image.is_nested: + latent_image, latent_shapes = comfy.utils.pack_latents(latent_image.unbind()) + noise, _ = comfy.utils.pack_latents(noise.unbind()) + else: + latent_shapes = [latent_image.shape] + self.conds = {} for k in self.original_conds: self.conds[k] = list(map(lambda a: a.copy(), self.original_conds[k])) @@ -1026,7 +1032,7 @@ class CFGGuider: self, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, self.model_options, is_model_options=True) ) - output = executor.execute(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed) + output = executor.execute(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes) finally: cast_to_load_options(self.model_options, device=self.model_patcher.offload_device) self.model_options = orig_model_options @@ -1034,6 +1040,9 @@ class CFGGuider: self.model_patcher.restore_hook_patches() del self.conds + + if len(latent_shapes) > 1: + output = comfy.nested_tensor.NestedTensor(comfy.utils.unpack_latents(output, latent_shapes)) return output diff --git a/comfy/utils.py b/comfy/utils.py index 0fd03f165..4bd281057 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -1106,3 +1106,25 @@ def upscale_dit_mask(mask: torch.Tensor, img_size_in, img_size_out): dim=1 ) return out + +def pack_latents(latents): + latent_shapes = [] + tensors = [] + for tensor in latents: + latent_shapes.append(tensor.shape) + tensors.append(tensor.reshape(tensor.shape[0], 1, -1)) + + latent = torch.cat(tensors, dim=-1) + return latent, latent_shapes + +def unpack_latents(combined_latent, latent_shapes): + if len(latent_shapes) > 1: + output_tensors = [] + for shape in latent_shapes: + cut = math.prod(shape[1:]) + tens = combined_latent[:, :, :cut] + combined_latent = combined_latent[:, :, cut:] + output_tensors.append(tens.reshape([tens.shape[0]] + list(shape)[1:])) + else: + output_tensors = combined_latent + return output_tensors From 24188b3141aace272cb91b85578c76f5a8f70e1c Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Fri, 24 Oct 2025 13:36:30 +0800 Subject: [PATCH 483/759] Update template to 0.2.2 (#10461) Fix template typo issue --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index dd2afcab0..8570c66b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.28.7 -comfyui-workflow-templates==0.2.1 +comfyui-workflow-templates==0.2.2 comfyui-embedded-docs==0.3.0 torch torchsde From 388b306a2b48070737b092b51e76de933baee9ad Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 24 Oct 2025 08:37:16 +0300 Subject: [PATCH 484/759] feat(api-nodes): network client v2: async ops, cancellation, downloads, refactor (#10390) * feat(api-nodes): implement new API client for V3 nodes * feat(api-nodes): implement new API client for V3 nodes * feat(api-nodes): implement new API client for V3 nodes * converted WAN nodes to use new client; polishing * fix(auth): do not leak authentification for the absolute urls * convert BFL API nodes to use new API client; remove deprecated BFL nodes * converted Google Veo nodes * fix(Veo3.1 model): take into account "generate_audio" parameter --- comfy_api_nodes/apinode_utils.py | 435 +--------- comfy_api_nodes/apis/bfl_api.py | 51 +- comfy_api_nodes/apis/veo_api.py | 111 +++ comfy_api_nodes/nodes_bfl.py | 605 ++++---------- comfy_api_nodes/nodes_bytedance.py | 279 ++----- comfy_api_nodes/nodes_gemini.py | 5 +- comfy_api_nodes/nodes_kling.py | 350 +++----- comfy_api_nodes/nodes_luma.py | 2 +- comfy_api_nodes/nodes_minimax.py | 2 +- comfy_api_nodes/nodes_moonvalley.py | 366 ++------- comfy_api_nodes/nodes_openai.py | 4 +- comfy_api_nodes/nodes_pika.py | 6 +- comfy_api_nodes/nodes_pixverse.py | 13 +- comfy_api_nodes/nodes_recraft.py | 4 +- comfy_api_nodes/nodes_runway.py | 199 ++--- comfy_api_nodes/nodes_sora.py | 74 +- comfy_api_nodes/nodes_stability.py | 8 +- comfy_api_nodes/nodes_veo2.py | 176 ++-- comfy_api_nodes/nodes_vidu.py | 131 +-- comfy_api_nodes/nodes_wan.py | 245 +++--- comfy_api_nodes/util/__init__.py | 87 ++ comfy_api_nodes/util/_helpers.py | 71 ++ comfy_api_nodes/util/client.py | 941 ++++++++++++++++++++++ comfy_api_nodes/util/common_exceptions.py | 14 + comfy_api_nodes/util/conversions.py | 407 ++++++++++ comfy_api_nodes/util/download_helpers.py | 249 ++++++ comfy_api_nodes/util/upload_helpers.py | 338 ++++++++ comfy_api_nodes/util/validation_utils.py | 58 +- pyproject.toml | 2 + 29 files changed, 2935 insertions(+), 2298 deletions(-) create mode 100644 comfy_api_nodes/apis/veo_api.py create mode 100644 comfy_api_nodes/util/_helpers.py create mode 100644 comfy_api_nodes/util/client.py create mode 100644 comfy_api_nodes/util/common_exceptions.py create mode 100644 comfy_api_nodes/util/conversions.py create mode 100644 comfy_api_nodes/util/download_helpers.py create mode 100644 comfy_api_nodes/util/upload_helpers.py diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py index bc3d2d07e..e3d282059 100644 --- a/comfy_api_nodes/apinode_utils.py +++ b/comfy_api_nodes/apinode_utils.py @@ -1,15 +1,10 @@ from __future__ import annotations import aiohttp -import io -import logging import mimetypes -import os from typing import Optional, Union from comfy.utils import common_upscale -from comfy_api.input_impl import VideoFromFile from comfy_api.util import VideoContainer, VideoCodec from comfy_api.input.video_types import VideoInput -from comfy_api.input.basic_types import AudioInput from comfy_api_nodes.apis.client import ( ApiClient, ApiEndpoint, @@ -26,43 +21,8 @@ from PIL import Image import torch import math import base64 -import uuid +from .util import tensor_to_bytesio, bytesio_to_image_tensor from io import BytesIO -import av - - -async def download_url_to_video_output( - video_url: str, timeout: int = None, auth_kwargs: Optional[dict[str, str]] = None -) -> VideoFromFile: - """Downloads a video from a URL and returns a `VIDEO` output. - - Args: - video_url: The URL of the video to download. - - Returns: - A Comfy node `VIDEO` output. - """ - video_io = await download_url_to_bytesio(video_url, timeout, auth_kwargs=auth_kwargs) - if video_io is None: - error_msg = f"Failed to download video from {video_url}" - logging.error(error_msg) - raise ValueError(error_msg) - return VideoFromFile(video_io) - - -def downscale_image_tensor(image, total_pixels=1536 * 1024) -> torch.Tensor: - """Downscale input image tensor to roughly the specified total pixels.""" - samples = image.movedim(-1, 1) - total = int(total_pixels) - scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) - if scale_by >= 1: - return image - width = round(samples.shape[3] * scale_by) - height = round(samples.shape[2] * scale_by) - - s = common_upscale(samples, width, height, "lanczos", "disabled") - s = s.movedim(1, -1) - return s async def validate_and_cast_response( @@ -162,11 +122,6 @@ def validate_aspect_ratio( return aspect_ratio -def mimetype_to_extension(mime_type: str) -> str: - """Converts a MIME type to a file extension.""" - return mime_type.split("/")[-1].lower() - - async def download_url_to_bytesio( url: str, timeout: int = None, auth_kwargs: Optional[dict[str, str]] = None ) -> BytesIO: @@ -195,136 +150,11 @@ async def download_url_to_bytesio( return BytesIO(await resp.read()) -def bytesio_to_image_tensor(image_bytesio: BytesIO, mode: str = "RGBA") -> torch.Tensor: - """Converts image data from BytesIO to a torch.Tensor. - - Args: - image_bytesio: BytesIO object containing the image data. - mode: The PIL mode to convert the image to (e.g., "RGB", "RGBA"). - - Returns: - A torch.Tensor representing the image (1, H, W, C). - - Raises: - PIL.UnidentifiedImageError: If the image data cannot be identified. - ValueError: If the specified mode is invalid. - """ - image = Image.open(image_bytesio) - image = image.convert(mode) - image_array = np.array(image).astype(np.float32) / 255.0 - return torch.from_numpy(image_array).unsqueeze(0) - - -async def download_url_to_image_tensor(url: str, timeout: int = None) -> torch.Tensor: - """Downloads an image from a URL and returns a [B, H, W, C] tensor.""" - image_bytesio = await download_url_to_bytesio(url, timeout) - return bytesio_to_image_tensor(image_bytesio) - - def process_image_response(response_content: bytes | str) -> torch.Tensor: """Uses content from a Response object and converts it to a torch.Tensor""" return bytesio_to_image_tensor(BytesIO(response_content)) -def _tensor_to_pil(image: torch.Tensor, total_pixels: int = 2048 * 2048) -> Image.Image: - """Converts a single torch.Tensor image [H, W, C] to a PIL Image, optionally downscaling.""" - if len(image.shape) > 3: - image = image[0] - # TODO: remove alpha if not allowed and present - input_tensor = image.cpu() - input_tensor = downscale_image_tensor( - input_tensor.unsqueeze(0), total_pixels=total_pixels - ).squeeze() - image_np = (input_tensor.numpy() * 255).astype(np.uint8) - img = Image.fromarray(image_np) - return img - - -def _pil_to_bytesio(img: Image.Image, mime_type: str = "image/png") -> BytesIO: - """Converts a PIL Image to a BytesIO object.""" - if not mime_type: - mime_type = "image/png" - - img_byte_arr = io.BytesIO() - # Derive PIL format from MIME type (e.g., 'image/png' -> 'PNG') - pil_format = mime_type.split("/")[-1].upper() - if pil_format == "JPG": - pil_format = "JPEG" - img.save(img_byte_arr, format=pil_format) - img_byte_arr.seek(0) - return img_byte_arr - - -def tensor_to_bytesio( - image: torch.Tensor, - name: Optional[str] = None, - total_pixels: int = 2048 * 2048, - mime_type: str = "image/png", -) -> BytesIO: - """Converts a torch.Tensor image to a named BytesIO object. - - Args: - image: Input torch.Tensor image. - name: Optional filename for the BytesIO object. - total_pixels: Maximum total pixels for potential downscaling. - mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4'). - - Returns: - Named BytesIO object containing the image data, with pointer set to the start of buffer. - """ - if not mime_type: - mime_type = "image/png" - - pil_image = _tensor_to_pil(image, total_pixels=total_pixels) - img_binary = _pil_to_bytesio(pil_image, mime_type=mime_type) - img_binary.name = ( - f"{name if name else uuid.uuid4()}.{mimetype_to_extension(mime_type)}" - ) - return img_binary - - -def tensor_to_base64_string( - image_tensor: torch.Tensor, - total_pixels: int = 2048 * 2048, - mime_type: str = "image/png", -) -> str: - """Convert [B, H, W, C] or [H, W, C] tensor to a base64 string. - - Args: - image_tensor: Input torch.Tensor image. - total_pixels: Maximum total pixels for potential downscaling. - mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4'). - - Returns: - Base64 encoded string of the image. - """ - pil_image = _tensor_to_pil(image_tensor, total_pixels=total_pixels) - img_byte_arr = _pil_to_bytesio(pil_image, mime_type=mime_type) - img_bytes = img_byte_arr.getvalue() - # Encode bytes to base64 string - base64_encoded_string = base64.b64encode(img_bytes).decode("utf-8") - return base64_encoded_string - - -def tensor_to_data_uri( - image_tensor: torch.Tensor, - total_pixels: int = 2048 * 2048, - mime_type: str = "image/png", -) -> str: - """Converts a tensor image to a Data URI string. - - Args: - image_tensor: Input torch.Tensor image. - total_pixels: Maximum total pixels for potential downscaling. - mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp'). - - Returns: - Data URI string (e.g., 'data:image/png;base64,...'). - """ - base64_string = tensor_to_base64_string(image_tensor, total_pixels, mime_type) - return f"data:{mime_type};base64,{base64_string}" - - def text_filepath_to_base64_string(filepath: str) -> str: """Converts a text file to a base64 string.""" with open(filepath, "rb") as f: @@ -392,7 +222,7 @@ def video_to_base64_string( container_format: Optional container format to use (defaults to video.container if available) codec: Optional codec to use (defaults to video.codec if available) """ - video_bytes_io = io.BytesIO() + video_bytes_io = BytesIO() # Use provided format/codec if specified, otherwise use video's own if available format_to_use = container_format if container_format is not None else getattr(video, 'container', VideoContainer.MP4) @@ -403,214 +233,6 @@ def video_to_base64_string( return base64.b64encode(video_bytes_io.getvalue()).decode("utf-8") -async def upload_video_to_comfyapi( - video: VideoInput, - auth_kwargs: Optional[dict[str, str]] = None, - container: VideoContainer = VideoContainer.MP4, - codec: VideoCodec = VideoCodec.H264, - max_duration: Optional[int] = None, -) -> str: - """ - Uploads a single video to ComfyUI API and returns its download URL. - Uses the specified container and codec for saving the video before upload. - - Args: - video: VideoInput object (Comfy VIDEO type). - auth_kwargs: Optional authentication token(s). - container: The video container format to use (default: MP4). - codec: The video codec to use (default: H264). - max_duration: Optional maximum duration of the video in seconds. If the video is longer than this, an error will be raised. - - Returns: - The download URL for the uploaded video file. - """ - if max_duration is not None: - try: - actual_duration = video.duration_seconds - if actual_duration is not None and actual_duration > max_duration: - raise ValueError( - f"Video duration ({actual_duration:.2f}s) exceeds the maximum allowed ({max_duration}s)." - ) - except Exception as e: - logging.error("Error getting video duration: %s", str(e)) - raise ValueError(f"Could not verify video duration from source: {e}") from e - - upload_mime_type = f"video/{container.value.lower()}" - filename = f"uploaded_video.{container.value.lower()}" - - # Convert VideoInput to BytesIO using specified container/codec - video_bytes_io = io.BytesIO() - video.save_to(video_bytes_io, format=container, codec=codec) - video_bytes_io.seek(0) - - return await upload_file_to_comfyapi(video_bytes_io, filename, upload_mime_type, auth_kwargs) - - -def audio_tensor_to_contiguous_ndarray(waveform: torch.Tensor) -> np.ndarray: - """ - Prepares audio waveform for av library by converting to a contiguous numpy array. - - Args: - waveform: a tensor of shape (1, channels, samples) derived from a Comfy `AUDIO` type. - - Returns: - Contiguous numpy array of the audio waveform. If the audio was batched, - the first item is taken. - """ - if waveform.ndim != 3 or waveform.shape[0] != 1: - raise ValueError("Expected waveform tensor shape (1, channels, samples)") - - # If batch is > 1, take first item - if waveform.shape[0] > 1: - waveform = waveform[0] - - # Prepare for av: remove batch dim, move to CPU, make contiguous, convert to numpy array - audio_data_np = waveform.squeeze(0).cpu().contiguous().numpy() - if audio_data_np.dtype != np.float32: - audio_data_np = audio_data_np.astype(np.float32) - - return audio_data_np - - -def audio_ndarray_to_bytesio( - audio_data_np: np.ndarray, - sample_rate: int, - container_format: str = "mp4", - codec_name: str = "aac", -) -> BytesIO: - """ - Encodes a numpy array of audio data into a BytesIO object. - """ - audio_bytes_io = io.BytesIO() - with av.open(audio_bytes_io, mode="w", format=container_format) as output_container: - audio_stream = output_container.add_stream(codec_name, rate=sample_rate) - frame = av.AudioFrame.from_ndarray( - audio_data_np, - format="fltp", - layout="stereo" if audio_data_np.shape[0] > 1 else "mono", - ) - frame.sample_rate = sample_rate - frame.pts = 0 - - for packet in audio_stream.encode(frame): - output_container.mux(packet) - - # Flush stream - for packet in audio_stream.encode(None): - output_container.mux(packet) - - audio_bytes_io.seek(0) - return audio_bytes_io - - -async def upload_audio_to_comfyapi( - audio: AudioInput, - auth_kwargs: Optional[dict[str, str]] = None, - container_format: str = "mp4", - codec_name: str = "aac", - mime_type: str = "audio/mp4", - filename: str = "uploaded_audio.mp4", -) -> str: - """ - Uploads a single audio input to ComfyUI API and returns its download URL. - Encodes the raw waveform into the specified format before uploading. - - Args: - audio: a Comfy `AUDIO` type (contains waveform tensor and sample_rate) - auth_kwargs: Optional authentication token(s). - - Returns: - The download URL for the uploaded audio file. - """ - sample_rate: int = audio["sample_rate"] - waveform: torch.Tensor = audio["waveform"] - audio_data_np = audio_tensor_to_contiguous_ndarray(waveform) - audio_bytes_io = audio_ndarray_to_bytesio( - audio_data_np, sample_rate, container_format, codec_name - ) - - return await upload_file_to_comfyapi(audio_bytes_io, filename, mime_type, auth_kwargs) - - -def f32_pcm(wav: torch.Tensor) -> torch.Tensor: - """Convert audio to float 32 bits PCM format. Copy-paste from nodes_audio.py file.""" - if wav.dtype.is_floating_point: - return wav - elif wav.dtype == torch.int16: - return wav.float() / (2 ** 15) - elif wav.dtype == torch.int32: - return wav.float() / (2 ** 31) - raise ValueError(f"Unsupported wav dtype: {wav.dtype}") - - -def audio_bytes_to_audio_input(audio_bytes: bytes,) -> dict: - """ - Decode any common audio container from bytes using PyAV and return - a Comfy AUDIO dict: {"waveform": [1, C, T] float32, "sample_rate": int}. - """ - with av.open(io.BytesIO(audio_bytes)) as af: - if not af.streams.audio: - raise ValueError("No audio stream found in response.") - stream = af.streams.audio[0] - - in_sr = int(stream.codec_context.sample_rate) - out_sr = in_sr - - frames: list[torch.Tensor] = [] - n_channels = stream.channels or 1 - - for frame in af.decode(streams=stream.index): - arr = frame.to_ndarray() # shape can be [C, T] or [T, C] or [T] - buf = torch.from_numpy(arr) - if buf.ndim == 1: - buf = buf.unsqueeze(0) # [T] -> [1, T] - elif buf.shape[0] != n_channels and buf.shape[-1] == n_channels: - buf = buf.transpose(0, 1).contiguous() # [T, C] -> [C, T] - elif buf.shape[0] != n_channels: - buf = buf.reshape(-1, n_channels).t().contiguous() # fallback to [C, T] - frames.append(buf) - - if not frames: - raise ValueError("Decoded zero audio frames.") - - wav = torch.cat(frames, dim=1) # [C, T] - wav = f32_pcm(wav) - return {"waveform": wav.unsqueeze(0).contiguous(), "sample_rate": out_sr} - - -def audio_input_to_mp3(audio: AudioInput) -> io.BytesIO: - waveform = audio["waveform"].cpu() - - output_buffer = io.BytesIO() - output_container = av.open(output_buffer, mode='w', format="mp3") - - out_stream = output_container.add_stream("libmp3lame", rate=audio["sample_rate"]) - out_stream.bit_rate = 320000 - - frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[0] == 1 else 'stereo') - frame.sample_rate = audio["sample_rate"] - frame.pts = 0 - output_container.mux(out_stream.encode(frame)) - output_container.mux(out_stream.encode(None)) - output_container.close() - output_buffer.seek(0) - return output_buffer - - -def audio_to_base64_string( - audio: AudioInput, container_format: str = "mp4", codec_name: str = "aac" -) -> str: - """Converts an audio input to a base64 string.""" - sample_rate: int = audio["sample_rate"] - waveform: torch.Tensor = audio["waveform"] - audio_data_np = audio_tensor_to_contiguous_ndarray(waveform) - audio_bytes_io = audio_ndarray_to_bytesio( - audio_data_np, sample_rate, container_format, codec_name - ) - audio_bytes = audio_bytes_io.getvalue() - return base64.b64encode(audio_bytes).decode("utf-8") - - async def upload_images_to_comfyapi( image: torch.Tensor, max_images=8, @@ -663,56 +285,3 @@ def resize_mask_to_image( if not allow_gradient: mask = (mask > 0.5).float() return mask - - -def validate_string( - string: str, - strip_whitespace=True, - field_name="prompt", - min_length=None, - max_length=None, -): - if string is None: - raise Exception(f"Field '{field_name}' cannot be empty.") - if strip_whitespace: - string = string.strip() - if min_length and len(string) < min_length: - raise Exception( - f"Field '{field_name}' cannot be shorter than {min_length} characters; was {len(string)} characters long." - ) - if max_length and len(string) > max_length: - raise Exception( - f" Field '{field_name} cannot be longer than {max_length} characters; was {len(string)} characters long." - ) - - -def image_tensor_pair_to_batch( - image1: torch.Tensor, image2: torch.Tensor -) -> torch.Tensor: - """ - Converts a pair of image tensors to a batch tensor. - If the images are not the same size, the smaller image is resized to - match the larger image. - """ - if image1.shape[1:] != image2.shape[1:]: - image2 = common_upscale( - image2.movedim(-1, 1), - image1.shape[2], - image1.shape[1], - "bilinear", - "center", - ).movedim(1, -1) - return torch.cat((image1, image2), dim=0) - - -def get_size(path_or_object: Union[str, io.BytesIO]) -> int: - if isinstance(path_or_object, str): - return os.path.getsize(path_or_object) - return len(path_or_object.getvalue()) - - -def validate_container_format_is_mp4(video: VideoInput) -> None: - """Validates video container format is MP4.""" - container_format = video.get_container_format() - if container_format not in ["mp4", "mov,mp4,m4a,3gp,3g2,mj2"]: - raise ValueError(f"Only MP4 container format supported. Got: {container_format}") diff --git a/comfy_api_nodes/apis/bfl_api.py b/comfy_api_nodes/apis/bfl_api.py index 0e90aef7c..0fc8c0607 100644 --- a/comfy_api_nodes/apis/bfl_api.py +++ b/comfy_api_nodes/apis/bfl_api.py @@ -50,44 +50,6 @@ class BFLFluxFillImageRequest(BaseModel): mask: str = Field(None, description='A Base64-encoded string representing the mask of the areas you with to modify.') -class BFLFluxCannyImageRequest(BaseModel): - prompt: str = Field(..., description='Text prompt for image generation') - prompt_upsampling: Optional[bool] = Field( - None, description='Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation.' - ) - canny_low_threshold: Optional[int] = Field(None, description='Low threshold for Canny edge detection') - canny_high_threshold: Optional[int] = Field(None, description='High threshold for Canny edge detection') - seed: Optional[int] = Field(None, description='The seed value for reproducibility.') - steps: conint(ge=15, le=50) = Field(..., description='Number of steps for the image generation process') - guidance: confloat(ge=1, le=100) = Field(..., description='Guidance strength for the image generation process') - safety_tolerance: Optional[conint(ge=0, le=6)] = Field( - 6, description='Tolerance level for input and output moderation. Between 0 and 6, 0 being most strict, 6 being least strict. Defaults to 2.' - ) - output_format: Optional[BFLOutputFormat] = Field( - BFLOutputFormat.png, description="Output format for the generated image. Can be 'jpeg' or 'png'.", examples=['png'] - ) - control_image: Optional[str] = Field(None, description='Base64 encoded image to use as control input if no preprocessed image is provided') - preprocessed_image: Optional[str] = Field(None, description='Optional pre-processed image that will bypass the control preprocessing step') - - -class BFLFluxDepthImageRequest(BaseModel): - prompt: str = Field(..., description='Text prompt for image generation') - prompt_upsampling: Optional[bool] = Field( - None, description='Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation.' - ) - seed: Optional[int] = Field(None, description='The seed value for reproducibility.') - steps: conint(ge=15, le=50) = Field(..., description='Number of steps for the image generation process') - guidance: confloat(ge=1, le=100) = Field(..., description='Guidance strength for the image generation process') - safety_tolerance: Optional[conint(ge=0, le=6)] = Field( - 6, description='Tolerance level for input and output moderation. Between 0 and 6, 0 being most strict, 6 being least strict. Defaults to 2.' - ) - output_format: Optional[BFLOutputFormat] = Field( - BFLOutputFormat.png, description="Output format for the generated image. Can be 'jpeg' or 'png'.", examples=['png'] - ) - control_image: Optional[str] = Field(None, description='Base64 encoded image to use as control input if no preprocessed image is provided') - preprocessed_image: Optional[str] = Field(None, description='Optional pre-processed image that will bypass the control preprocessing step') - - class BFLFluxProGenerateRequest(BaseModel): prompt: str = Field(..., description='The text prompt for image generation.') prompt_upsampling: Optional[bool] = Field( @@ -160,15 +122,8 @@ class BFLStatus(str, Enum): error = "Error" -class BFLFluxProStatusResponse(BaseModel): +class BFLFluxStatusResponse(BaseModel): id: str = Field(..., description="The unique identifier for the generation task.") status: BFLStatus = Field(..., description="The status of the task.") - result: Optional[Dict[str, Any]] = Field( - None, description="The result of the task (null if not completed)." - ) - progress: confloat(ge=0.0, le=1.0) = Field( - ..., description="The progress of the task (0.0 to 1.0)." - ) - details: Optional[Dict[str, Any]] = Field( - None, description="Additional details about the task (null if not available)." - ) + result: Optional[Dict[str, Any]] = Field(None, description="The result of the task (null if not completed).") + progress: Optional[float] = Field(None, description="The progress of the task (0.0 to 1.0).", ge=0.0, le=1.0) diff --git a/comfy_api_nodes/apis/veo_api.py b/comfy_api_nodes/apis/veo_api.py new file mode 100644 index 000000000..a55137afb --- /dev/null +++ b/comfy_api_nodes/apis/veo_api.py @@ -0,0 +1,111 @@ +from typing import Optional, Union +from enum import Enum + +from pydantic import BaseModel, Field + + +class Image2(BaseModel): + bytesBase64Encoded: str + gcsUri: Optional[str] = None + mimeType: Optional[str] = None + + +class Image3(BaseModel): + bytesBase64Encoded: Optional[str] = None + gcsUri: str + mimeType: Optional[str] = None + + +class Instance1(BaseModel): + image: Optional[Union[Image2, Image3]] = Field( + None, description='Optional image to guide video generation' + ) + prompt: str = Field(..., description='Text description of the video') + + +class PersonGeneration1(str, Enum): + ALLOW = 'ALLOW' + BLOCK = 'BLOCK' + + +class Parameters1(BaseModel): + aspectRatio: Optional[str] = Field(None, examples=['16:9']) + durationSeconds: Optional[int] = None + enhancePrompt: Optional[bool] = None + generateAudio: Optional[bool] = Field( + None, + description='Generate audio for the video. Only supported by veo 3 models.', + ) + negativePrompt: Optional[str] = None + personGeneration: Optional[PersonGeneration1] = None + sampleCount: Optional[int] = None + seed: Optional[int] = None + storageUri: Optional[str] = Field( + None, description='Optional Cloud Storage URI to upload the video' + ) + + +class VeoGenVidRequest(BaseModel): + instances: Optional[list[Instance1]] = None + parameters: Optional[Parameters1] = None + + +class VeoGenVidResponse(BaseModel): + name: str = Field( + ..., + description='Operation resource name', + examples=[ + 'projects/PROJECT_ID/locations/us-central1/publishers/google/models/MODEL_ID/operations/a1b07c8e-7b5a-4aba-bb34-3e1ccb8afcc8' + ], + ) + + +class VeoGenVidPollRequest(BaseModel): + operationName: str = Field( + ..., + description='Full operation name (from predict response)', + examples=[ + 'projects/PROJECT_ID/locations/us-central1/publishers/google/models/MODEL_ID/operations/OPERATION_ID' + ], + ) + + +class Video(BaseModel): + bytesBase64Encoded: Optional[str] = Field( + None, description='Base64-encoded video content' + ) + gcsUri: Optional[str] = Field(None, description='Cloud Storage URI of the video') + mimeType: Optional[str] = Field(None, description='Video MIME type') + + +class Error1(BaseModel): + code: Optional[int] = Field(None, description='Error code') + message: Optional[str] = Field(None, description='Error message') + + +class Response1(BaseModel): + field_type: Optional[str] = Field( + None, + alias='@type', + examples=[ + 'type.googleapis.com/cloud.ai.large_models.vision.GenerateVideoResponse' + ], + ) + raiMediaFilteredCount: Optional[int] = Field( + None, description='Count of media filtered by responsible AI policies' + ) + raiMediaFilteredReasons: Optional[list[str]] = Field( + None, description='Reasons why media was filtered by responsible AI policies' + ) + videos: Optional[list[Video]] = None + + +class VeoGenVidPollResponse(BaseModel): + done: Optional[bool] = None + error: Optional[Error1] = Field( + None, description='Error details if operation failed' + ) + name: Optional[str] = None + response: Optional[Response1] = Field( + None, description='The actual prediction response if done is true' + ) diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py index b6cc90f05..baa74fd52 100644 --- a/comfy_api_nodes/nodes_bfl.py +++ b/comfy_api_nodes/nodes_bfl.py @@ -1,136 +1,43 @@ -import asyncio -import io from inspect import cleandoc -from typing import Union, Optional +from typing import Optional + +import torch from typing_extensions import override -from comfy_api.latest import ComfyExtension, IO + +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.apinode_utils import ( + resize_mask_to_image, + validate_aspect_ratio, +) from comfy_api_nodes.apis.bfl_api import ( - BFLStatus, BFLFluxExpandImageRequest, BFLFluxFillImageRequest, - BFLFluxCannyImageRequest, - BFLFluxDepthImageRequest, - BFLFluxProGenerateRequest, BFLFluxKontextProGenerateRequest, - BFLFluxProUltraGenerateRequest, + BFLFluxProGenerateRequest, BFLFluxProGenerateResponse, + BFLFluxProUltraGenerateRequest, + BFLFluxStatusResponse, + BFLStatus, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, -) -from comfy_api_nodes.apinode_utils import ( - downscale_image_tensor, - validate_aspect_ratio, - process_image_response, - resize_mask_to_image, + download_url_to_image_tensor, + poll_op, + sync_op, + tensor_to_base64_string, validate_string, ) -import numpy as np -from PIL import Image -import aiohttp -import torch -import base64 -import time -from server import PromptServer - def convert_mask_to_image(mask: torch.Tensor): """ Make mask have the expected amount of dims (4) and channels (3) to be recognized as an image. """ mask = mask.unsqueeze(-1) - mask = torch.cat([mask]*3, dim=-1) + mask = torch.cat([mask] * 3, dim=-1) return mask -async def handle_bfl_synchronous_operation( - operation: SynchronousOperation, - timeout_bfl_calls=360, - node_id: Union[str, None] = None, -): - response_api: BFLFluxProGenerateResponse = await operation.execute() - return await _poll_until_generated( - response_api.polling_url, timeout=timeout_bfl_calls, node_id=node_id - ) - - -async def _poll_until_generated( - polling_url: str, timeout=360, node_id: Union[str, None] = None -): - # used bfl-comfy-nodes to verify code implementation: - # https://github.com/black-forest-labs/bfl-comfy-nodes/tree/main - start_time = time.time() - retries_404 = 0 - max_retries_404 = 5 - retry_404_seconds = 2 - retry_202_seconds = 2 - retry_pending_seconds = 1 - - async with aiohttp.ClientSession() as session: - # NOTE: should True loop be replaced with checking if workflow has been interrupted? - while True: - if node_id: - time_elapsed = time.time() - start_time - PromptServer.instance.send_progress_text( - f"Generating ({time_elapsed:.0f}s)", node_id - ) - - async with session.get(polling_url) as response: - if response.status == 200: - result = await response.json() - if result["status"] == BFLStatus.ready: - img_url = result["result"]["sample"] - if node_id: - PromptServer.instance.send_progress_text( - f"Result URL: {img_url}", node_id - ) - async with session.get(img_url) as img_resp: - return process_image_response(await img_resp.content.read()) - elif result["status"] in [ - BFLStatus.request_moderated, - BFLStatus.content_moderated, - ]: - status = result["status"] - raise Exception( - f"BFL API did not return an image due to: {status}." - ) - elif result["status"] == BFLStatus.error: - raise Exception(f"BFL API encountered an error: {result}.") - elif result["status"] == BFLStatus.pending: - await asyncio.sleep(retry_pending_seconds) - continue - elif response.status == 404: - if retries_404 < max_retries_404: - retries_404 += 1 - await asyncio.sleep(retry_404_seconds) - continue - raise Exception( - f"BFL API could not find task after {max_retries_404} tries." - ) - elif response.status == 202: - await asyncio.sleep(retry_202_seconds) - elif time.time() - start_time > timeout: - raise Exception( - f"BFL API experienced a timeout; could not return request under {timeout} seconds." - ) - else: - raise Exception(f"BFL API encountered an error: {response.json()}") - -def convert_image_to_base64(image: torch.Tensor): - scaled_image = downscale_image_tensor(image, total_pixels=2048 * 2048) - # remove batch dimension if present - if len(scaled_image.shape) > 3: - scaled_image = scaled_image[0] - image_np = (scaled_image.numpy() * 255).astype(np.uint8) - img = Image.fromarray(image_np) - img_byte_arr = io.BytesIO() - img.save(img_byte_arr, format="PNG") - return base64.b64encode(img_byte_arr.getvalue()).decode() - - class FluxProUltraImageNode(IO.ComfyNode): """ Generates images using Flux Pro 1.1 Ultra via api based on prompt and resolution. @@ -158,7 +65,9 @@ class FluxProUltraImageNode(IO.ComfyNode): IO.Boolean.Input( "prompt_upsampling", default=False, - tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", + tooltip="Whether to perform upsampling on the prompt. " + "If active, automatically modifies the prompt for more creative generation, " + "but results are nondeterministic (same seed will not produce exactly the same result).", ), IO.Int.Input( "seed", @@ -220,22 +129,19 @@ class FluxProUltraImageNode(IO.ComfyNode): cls, prompt: str, aspect_ratio: str, - prompt_upsampling=False, - raw=False, - seed=0, - image_prompt=None, - image_prompt_strength=0.1, + prompt_upsampling: bool = False, + raw: bool = False, + seed: int = 0, + image_prompt: Optional[torch.Tensor] = None, + image_prompt_strength: float = 0.1, ) -> IO.NodeOutput: if image_prompt is None: validate_string(prompt, strip_whitespace=False) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/bfl/flux-pro-1.1-ultra/generate", - method=HttpMethod.POST, - request_model=BFLFluxProUltraGenerateRequest, - response_model=BFLFluxProGenerateResponse, - ), - request=BFLFluxProUltraGenerateRequest( + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/bfl/flux-pro-1.1-ultra/generate", method="POST"), + response_model=BFLFluxProGenerateResponse, + data=BFLFluxProUltraGenerateRequest( prompt=prompt, prompt_upsampling=prompt_upsampling, seed=seed, @@ -247,22 +153,26 @@ class FluxProUltraImageNode(IO.ComfyNode): maximum_ratio_str=cls.MAXIMUM_RATIO_STR, ), raw=raw, - image_prompt=( - image_prompt - if image_prompt is None - else convert_image_to_base64(image_prompt) - ), - image_prompt_strength=( - None if image_prompt is None else round(image_prompt_strength, 2) - ), + image_prompt=(image_prompt if image_prompt is None else tensor_to_base64_string(image_prompt)), + image_prompt_strength=(None if image_prompt is None else round(image_prompt_strength, 2)), ), - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return IO.NodeOutput(output_image) + response = await poll_op( + cls, + ApiEndpoint(initial_response.polling_url), + response_model=BFLFluxStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + completed_statuses=[BFLStatus.ready], + failed_statuses=[ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + BFLStatus.error, + BFLStatus.task_not_found, + ], + queued_statuses=[], + ) + return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) class FluxKontextProImageNode(IO.ComfyNode): @@ -347,7 +257,7 @@ class FluxKontextProImageNode(IO.ComfyNode): aspect_ratio: str, guidance: float, steps: int, - input_image: Optional[torch.Tensor]=None, + input_image: Optional[torch.Tensor] = None, seed=0, prompt_upsampling=False, ) -> IO.NodeOutput: @@ -360,33 +270,36 @@ class FluxKontextProImageNode(IO.ComfyNode): ) if input_image is None: validate_string(prompt, strip_whitespace=False) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=cls.BFL_PATH, - method=HttpMethod.POST, - request_model=BFLFluxKontextProGenerateRequest, - response_model=BFLFluxProGenerateResponse, - ), - request=BFLFluxKontextProGenerateRequest( + initial_response = await sync_op( + cls, + ApiEndpoint(path=cls.BFL_PATH, method="POST"), + response_model=BFLFluxProGenerateResponse, + data=BFLFluxKontextProGenerateRequest( prompt=prompt, prompt_upsampling=prompt_upsampling, guidance=round(guidance, 1), steps=steps, seed=seed, aspect_ratio=aspect_ratio, - input_image=( - input_image - if input_image is None - else convert_image_to_base64(input_image) - ) + input_image=(input_image if input_image is None else tensor_to_base64_string(input_image)), ), - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return IO.NodeOutput(output_image) + response = await poll_op( + cls, + ApiEndpoint(initial_response.polling_url), + response_model=BFLFluxStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + completed_statuses=[BFLStatus.ready], + failed_statuses=[ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + BFLStatus.error, + BFLStatus.task_not_found, + ], + queued_statuses=[], + ) + return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) class FluxKontextMaxImageNode(FluxKontextProImageNode): @@ -422,7 +335,9 @@ class FluxProImageNode(IO.ComfyNode): IO.Boolean.Input( "prompt_upsampling", default=False, - tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", + tooltip="Whether to perform upsampling on the prompt. " + "If active, automatically modifies the prompt for more creative generation, " + "but results are nondeterministic (same seed will not produce exactly the same result).", ), IO.Int.Input( "width", @@ -481,20 +396,15 @@ class FluxProImageNode(IO.ComfyNode): image_prompt=None, # image_prompt_strength=0.1, ) -> IO.NodeOutput: - image_prompt = ( - image_prompt - if image_prompt is None - else convert_image_to_base64(image_prompt) - ) - - operation = SynchronousOperation( - endpoint=ApiEndpoint( + image_prompt = image_prompt if image_prompt is None else tensor_to_base64_string(image_prompt) + initial_response = await sync_op( + cls, + ApiEndpoint( path="/proxy/bfl/flux-pro-1.1/generate", - method=HttpMethod.POST, - request_model=BFLFluxProGenerateRequest, - response_model=BFLFluxProGenerateResponse, + method="POST", ), - request=BFLFluxProGenerateRequest( + response_model=BFLFluxProGenerateResponse, + data=BFLFluxProGenerateRequest( prompt=prompt, prompt_upsampling=prompt_upsampling, width=width, @@ -502,13 +412,23 @@ class FluxProImageNode(IO.ComfyNode): seed=seed, image_prompt=image_prompt, ), - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return IO.NodeOutput(output_image) + response = await poll_op( + cls, + ApiEndpoint(initial_response.polling_url), + response_model=BFLFluxStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + completed_statuses=[BFLStatus.ready], + failed_statuses=[ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + BFLStatus.error, + BFLStatus.task_not_found, + ], + queued_statuses=[], + ) + return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) class FluxProExpandNode(IO.ComfyNode): @@ -534,7 +454,9 @@ class FluxProExpandNode(IO.ComfyNode): IO.Boolean.Input( "prompt_upsampling", default=False, - tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", + tooltip="Whether to perform upsampling on the prompt. " + "If active, automatically modifies the prompt for more creative generation, " + "but results are nondeterministic (same seed will not produce exactly the same result).", ), IO.Int.Input( "top", @@ -610,16 +532,11 @@ class FluxProExpandNode(IO.ComfyNode): guidance: float, seed=0, ) -> IO.NodeOutput: - image = convert_image_to_base64(image) - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/bfl/flux-pro-1.0-expand/generate", - method=HttpMethod.POST, - request_model=BFLFluxExpandImageRequest, - response_model=BFLFluxProGenerateResponse, - ), - request=BFLFluxExpandImageRequest( + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/bfl/flux-pro-1.0-expand/generate", method="POST"), + response_model=BFLFluxProGenerateResponse, + data=BFLFluxExpandImageRequest( prompt=prompt, prompt_upsampling=prompt_upsampling, top=top, @@ -629,16 +546,25 @@ class FluxProExpandNode(IO.ComfyNode): steps=steps, guidance=guidance, seed=seed, - image=image, + image=tensor_to_base64_string(image), ), - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return IO.NodeOutput(output_image) - + response = await poll_op( + cls, + ApiEndpoint(initial_response.polling_url), + response_model=BFLFluxStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + completed_statuses=[BFLStatus.ready], + failed_statuses=[ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + BFLStatus.error, + BFLStatus.task_not_found, + ], + queued_statuses=[], + ) + return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) class FluxProFillNode(IO.ComfyNode): @@ -665,7 +591,9 @@ class FluxProFillNode(IO.ComfyNode): IO.Boolean.Input( "prompt_upsampling", default=False, - tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", + tooltip="Whether to perform upsampling on the prompt. " + "If active, automatically modifies the prompt for more creative generation, " + "but results are nondeterministic (same seed will not produce exactly the same result).", ), IO.Float.Input( "guidance", @@ -712,272 +640,37 @@ class FluxProFillNode(IO.ComfyNode): ) -> IO.NodeOutput: # prepare mask mask = resize_mask_to_image(mask, image) - mask = convert_image_to_base64(convert_mask_to_image(mask)) - # make sure image will have alpha channel removed - image = convert_image_to_base64(image[:, :, :, :3]) - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/bfl/flux-pro-1.0-fill/generate", - method=HttpMethod.POST, - request_model=BFLFluxFillImageRequest, - response_model=BFLFluxProGenerateResponse, - ), - request=BFLFluxFillImageRequest( + mask = tensor_to_base64_string(convert_mask_to_image(mask)) + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/bfl/flux-pro-1.0-fill/generate", method="POST"), + response_model=BFLFluxProGenerateResponse, + data=BFLFluxFillImageRequest( prompt=prompt, prompt_upsampling=prompt_upsampling, steps=steps, guidance=guidance, seed=seed, - image=image, + image=tensor_to_base64_string(image[:, :, :, :3]), # make sure image will have alpha channel removed mask=mask, ), - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return IO.NodeOutput(output_image) - - -class FluxProCannyNode(IO.ComfyNode): - """ - Generate image using a control image (canny). - """ - - @classmethod - def define_schema(cls) -> IO.Schema: - return IO.Schema( - node_id="FluxProCannyNode", - display_name="Flux.1 Canny Control Image", - category="api node/image/BFL", - description=cleandoc(cls.__doc__ or ""), - inputs=[ - IO.Image.Input("control_image"), - IO.String.Input( - "prompt", - multiline=True, - default="", - tooltip="Prompt for the image generation", - ), - IO.Boolean.Input( - "prompt_upsampling", - default=False, - tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - ), - IO.Float.Input( - "canny_low_threshold", - default=0.1, - min=0.01, - max=0.99, - step=0.01, - tooltip="Low threshold for Canny edge detection; ignored if skip_processing is True", - ), - IO.Float.Input( - "canny_high_threshold", - default=0.4, - min=0.01, - max=0.99, - step=0.01, - tooltip="High threshold for Canny edge detection; ignored if skip_processing is True", - ), - IO.Boolean.Input( - "skip_preprocessing", - default=False, - tooltip="Whether to skip preprocessing; set to True if control_image already is canny-fied, False if it is a raw image.", - ), - IO.Float.Input( - "guidance", - default=30, - min=1, - max=100, - tooltip="Guidance strength for the image generation process", - ), - IO.Int.Input( - "steps", - default=50, - min=15, - max=50, - tooltip="Number of steps for the image generation process", - ), - IO.Int.Input( - "seed", - default=0, - min=0, - max=0xFFFFFFFFFFFFFFFF, - control_after_generate=True, - tooltip="The random seed used for creating the noise.", - ), + response = await poll_op( + cls, + ApiEndpoint(initial_response.polling_url), + response_model=BFLFluxStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + completed_statuses=[BFLStatus.ready], + failed_statuses=[ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + BFLStatus.error, + BFLStatus.task_not_found, ], - outputs=[IO.Image.Output()], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], - is_api_node=True, + queued_statuses=[], ) - - @classmethod - async def execute( - cls, - control_image: torch.Tensor, - prompt: str, - prompt_upsampling: bool, - canny_low_threshold: float, - canny_high_threshold: float, - skip_preprocessing: bool, - steps: int, - guidance: float, - seed=0, - ) -> IO.NodeOutput: - control_image = convert_image_to_base64(control_image[:, :, :, :3]) - preprocessed_image = None - - # scale canny threshold between 0-500, to match BFL's API - def scale_value(value: float, min_val=0, max_val=500): - return min_val + value * (max_val - min_val) - canny_low_threshold = int(round(scale_value(canny_low_threshold))) - canny_high_threshold = int(round(scale_value(canny_high_threshold))) - - - if skip_preprocessing: - preprocessed_image = control_image - control_image = None - canny_low_threshold = None - canny_high_threshold = None - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/bfl/flux-pro-1.0-canny/generate", - method=HttpMethod.POST, - request_model=BFLFluxCannyImageRequest, - response_model=BFLFluxProGenerateResponse, - ), - request=BFLFluxCannyImageRequest( - prompt=prompt, - prompt_upsampling=prompt_upsampling, - steps=steps, - guidance=guidance, - seed=seed, - control_image=control_image, - canny_low_threshold=canny_low_threshold, - canny_high_threshold=canny_high_threshold, - preprocessed_image=preprocessed_image, - ), - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return IO.NodeOutput(output_image) - - -class FluxProDepthNode(IO.ComfyNode): - """ - Generate image using a control image (depth). - """ - - @classmethod - def define_schema(cls) -> IO.Schema: - return IO.Schema( - node_id="FluxProDepthNode", - display_name="Flux.1 Depth Control Image", - category="api node/image/BFL", - description=cleandoc(cls.__doc__ or ""), - inputs=[ - IO.Image.Input("control_image"), - IO.String.Input( - "prompt", - multiline=True, - default="", - tooltip="Prompt for the image generation", - ), - IO.Boolean.Input( - "prompt_upsampling", - default=False, - tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - ), - IO.Boolean.Input( - "skip_preprocessing", - default=False, - tooltip="Whether to skip preprocessing; set to True if control_image already is depth-ified, False if it is a raw image.", - ), - IO.Float.Input( - "guidance", - default=15, - min=1, - max=100, - tooltip="Guidance strength for the image generation process", - ), - IO.Int.Input( - "steps", - default=50, - min=15, - max=50, - tooltip="Number of steps for the image generation process", - ), - IO.Int.Input( - "seed", - default=0, - min=0, - max=0xFFFFFFFFFFFFFFFF, - control_after_generate=True, - tooltip="The random seed used for creating the noise.", - ), - ], - outputs=[IO.Image.Output()], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], - is_api_node=True, - ) - - @classmethod - async def execute( - cls, - control_image: torch.Tensor, - prompt: str, - prompt_upsampling: bool, - skip_preprocessing: bool, - steps: int, - guidance: float, - seed=0, - ) -> IO.NodeOutput: - control_image = convert_image_to_base64(control_image[:,:,:,:3]) - preprocessed_image = None - - if skip_preprocessing: - preprocessed_image = control_image - control_image = None - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/bfl/flux-pro-1.0-depth/generate", - method=HttpMethod.POST, - request_model=BFLFluxDepthImageRequest, - response_model=BFLFluxProGenerateResponse, - ), - request=BFLFluxDepthImageRequest( - prompt=prompt, - prompt_upsampling=prompt_upsampling, - steps=steps, - guidance=guidance, - seed=seed, - control_image=control_image, - preprocessed_image=preprocessed_image, - ), - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=cls.hidden.unique_id) - return IO.NodeOutput(output_image) + return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) class BFLExtension(ComfyExtension): @@ -990,8 +683,6 @@ class BFLExtension(ComfyExtension): FluxKontextMaxImageNode, FluxProExpandNode, FluxProFillNode, - FluxProCannyNode, - FluxProDepthNode, ] diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index f3d3f8d3e..534af380d 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -1,35 +1,27 @@ import logging import math from enum import Enum -from typing import Literal, Optional, Type, Union -from typing_extensions import override +from typing import Literal, Optional, Union import torch from pydantic import BaseModel, Field +from typing_extensions import override -from comfy_api.latest import ComfyExtension, IO -from comfy_api_nodes.util.validation_utils import ( - validate_image_aspect_ratio_range, - get_number_of_images, - validate_image_dimensions, -) -from comfy_api_nodes.apis.client import ( +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.util import ( ApiEndpoint, - EmptyRequest, - HttpMethod, - SynchronousOperation, - PollingOperation, - T, -) -from comfy_api_nodes.apinode_utils import ( download_url_to_image_tensor, download_url_to_video_output, - upload_images_to_comfyapi, - validate_string, + get_number_of_images, image_tensor_pair_to_batch, + poll_op, + sync_op, + upload_images_to_comfyapi, + validate_image_aspect_ratio_range, + validate_image_dimensions, + validate_string, ) - BYTEPLUS_IMAGE_ENDPOINT = "/proxy/byteplus/api/v3/images/generations" # Long-running tasks endpoints(e.g., video) @@ -46,13 +38,14 @@ class Image2ImageModelName(str, Enum): class Text2VideoModelName(str, Enum): - seedance_1_pro = "seedance-1-0-pro-250528" + seedance_1_pro = "seedance-1-0-pro-250528" seedance_1_lite = "seedance-1-0-lite-t2v-250428" class Image2VideoModelName(str, Enum): """note(August 31): Pro model only supports FirstFrame: https://docs.byteplus.com/en/docs/ModelArk/1520757""" - seedance_1_pro = "seedance-1-0-pro-250528" + + seedance_1_pro = "seedance-1-0-pro-250528" seedance_1_lite = "seedance-1-0-lite-i2v-250428" @@ -208,35 +201,6 @@ def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, N return None -async def poll_until_finished( - auth_kwargs: dict[str, str], - task_id: str, - estimated_duration: Optional[int] = None, - node_id: Optional[str] = None, -) -> TaskStatusResponse: - """Polls the ByteDance API endpoint until the task reaches a terminal state, then returns the response.""" - return await PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"{BYTEPLUS_TASK_STATUS_ENDPOINT}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TaskStatusResponse, - ), - completed_statuses=[ - "succeeded", - ], - failed_statuses=[ - "cancelled", - "failed", - ], - status_extractor=lambda response: response.status, - auth_kwargs=auth_kwargs, - result_url_extractor=get_video_url_from_task_status, - estimated_duration=estimated_duration, - node_id=node_id, - ).execute() - - class ByteDanceImageNode(IO.ComfyNode): @classmethod @@ -303,7 +267,7 @@ class ByteDanceImageNode(IO.ComfyNode): IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the image", + tooltip='Whether to add an "AI generated" watermark to the image', optional=True, ), ], @@ -341,8 +305,7 @@ class ByteDanceImageNode(IO.ComfyNode): w, h = width, height if not (512 <= w <= 2048) or not (512 <= h <= 2048): raise ValueError( - f"Custom size out of range: {w}x{h}. " - "Both width and height must be between 512 and 2048 pixels." + f"Custom size out of range: {w}x{h}. " "Both width and height must be between 512 and 2048 pixels." ) payload = Text2ImageTaskCreationRequest( @@ -353,20 +316,12 @@ class ByteDanceImageNode(IO.ComfyNode): guidance_scale=guidance_scale, watermark=watermark, ) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=BYTEPLUS_IMAGE_ENDPOINT, - method=HttpMethod.POST, - request_model=Text2ImageTaskCreationRequest, - response_model=ImageTaskCreationResponse, - ), - request=payload, - auth_kwargs=auth_kwargs, - ).execute() + response = await sync_op( + cls, + ApiEndpoint(path=BYTEPLUS_IMAGE_ENDPOINT, method="POST"), + data=payload, + response_model=ImageTaskCreationResponse, + ) return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) @@ -420,7 +375,7 @@ class ByteDanceImageEditNode(IO.ComfyNode): IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the image", + tooltip='Whether to add an "AI generated" watermark to the image', optional=True, ), ], @@ -449,16 +404,7 @@ class ByteDanceImageEditNode(IO.ComfyNode): if get_number_of_images(image) != 1: raise ValueError("Exactly one input image is required.") validate_image_aspect_ratio_range(image, (1, 3), (3, 1)) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - source_url = (await upload_images_to_comfyapi( - image, - max_images=1, - mime_type="image/png", - auth_kwargs=auth_kwargs, - ))[0] + source_url = (await upload_images_to_comfyapi(cls, image, max_images=1, mime_type="image/png"))[0] payload = Image2ImageTaskCreationRequest( model=model, prompt=prompt, @@ -467,16 +413,12 @@ class ByteDanceImageEditNode(IO.ComfyNode): guidance_scale=guidance_scale, watermark=watermark, ) - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=BYTEPLUS_IMAGE_ENDPOINT, - method=HttpMethod.POST, - request_model=Image2ImageTaskCreationRequest, - response_model=ImageTaskCreationResponse, - ), - request=payload, - auth_kwargs=auth_kwargs, - ).execute() + response = await sync_op( + cls, + ApiEndpoint(path=BYTEPLUS_IMAGE_ENDPOINT, method="POST"), + data=payload, + response_model=ImageTaskCreationResponse, + ) return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) @@ -504,7 +446,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode): IO.Image.Input( "image", tooltip="Input image(s) for image-to-image generation. " - "List of 1-10 images for single or multi-reference generation.", + "List of 1-10 images for single or multi-reference generation.", optional=True, ), IO.Combo.Input( @@ -534,9 +476,9 @@ class ByteDanceSeedreamNode(IO.ComfyNode): "sequential_image_generation", options=["disabled", "auto"], tooltip="Group image generation mode. " - "'disabled' generates a single image. " - "'auto' lets the model decide whether to generate multiple related images " - "(e.g., story scenes, character variations).", + "'disabled' generates a single image. " + "'auto' lets the model decide whether to generate multiple related images " + "(e.g., story scenes, character variations).", optional=True, ), IO.Int.Input( @@ -547,7 +489,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode): step=1, display_mode=IO.NumberDisplay.number, tooltip="Maximum number of images to generate when sequential_image_generation='auto'. " - "Total images (input + generated) cannot exceed 15.", + "Total images (input + generated) cannot exceed 15.", optional=True, ), IO.Int.Input( @@ -564,7 +506,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode): IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the image.", + tooltip='Whether to add an "AI generated" watermark to the image.', optional=True, ), IO.Boolean.Input( @@ -611,8 +553,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode): w, h = width, height if not (1024 <= w <= 4096) or not (1024 <= h <= 4096): raise ValueError( - f"Custom size out of range: {w}x{h}. " - "Both width and height must be between 1024 and 4096 pixels." + f"Custom size out of range: {w}x{h}. " "Both width and height must be between 1024 and 4096 pixels." ) n_input_images = get_number_of_images(image) if image is not None else 0 if n_input_images > 10: @@ -621,41 +562,31 @@ class ByteDanceSeedreamNode(IO.ComfyNode): raise ValueError( "The maximum number of generated images plus the number of reference images cannot exceed 15." ) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } reference_images_urls = [] if n_input_images: for i in image: validate_image_aspect_ratio_range(i, (1, 3), (3, 1)) - reference_images_urls = (await upload_images_to_comfyapi( + reference_images_urls = await upload_images_to_comfyapi( + cls, image, max_images=n_input_images, mime_type="image/png", - auth_kwargs=auth_kwargs, - )) - payload = Seedream4TaskCreationRequest( - model=model, - prompt=prompt, - image=reference_images_urls, - size=f"{w}x{h}", - seed=seed, - sequential_image_generation=sequential_image_generation, - sequential_image_generation_options=Seedream4Options(max_images=max_images), - watermark=watermark, - ) - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=BYTEPLUS_IMAGE_ENDPOINT, - method=HttpMethod.POST, - request_model=Seedream4TaskCreationRequest, - response_model=ImageTaskCreationResponse, + ) + response = await sync_op( + cls, + ApiEndpoint(path=BYTEPLUS_IMAGE_ENDPOINT, method="POST"), + response_model=ImageTaskCreationResponse, + data=Seedream4TaskCreationRequest( + model=model, + prompt=prompt, + image=reference_images_urls, + size=f"{w}x{h}", + seed=seed, + sequential_image_generation=sequential_image_generation, + sequential_image_generation_options=Seedream4Options(max_images=max_images), + watermark=watermark, ), - request=payload, - auth_kwargs=auth_kwargs, - ).execute() - + ) if len(response.data) == 1: return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) urls = [str(d["url"]) for d in response.data if isinstance(d, dict) and "url" in d] @@ -719,13 +650,13 @@ class ByteDanceTextToVideoNode(IO.ComfyNode): "camera_fixed", default=False, tooltip="Specifies whether to fix the camera. The platform appends an instruction " - "to fix the camera to your prompt, but does not guarantee the actual effect.", + "to fix the camera to your prompt, but does not guarantee the actual effect.", optional=True, ), IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the video.", + tooltip='Whether to add an "AI generated" watermark to the video.', optional=True, ), ], @@ -764,19 +695,9 @@ class ByteDanceTextToVideoNode(IO.ComfyNode): f"--camerafixed {str(camera_fixed).lower()} " f"--watermark {str(watermark).lower()}" ) - - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } return await process_video_task( - request_model=Text2VideoTaskCreationRequest, - payload=Text2VideoTaskCreationRequest( - model=model, - content=[TaskTextContent(text=prompt)], - ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, + cls, + payload=Text2VideoTaskCreationRequest(model=model, content=[TaskTextContent(text=prompt)]), estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))), ) @@ -840,13 +761,13 @@ class ByteDanceImageToVideoNode(IO.ComfyNode): "camera_fixed", default=False, tooltip="Specifies whether to fix the camera. The platform appends an instruction " - "to fix the camera to your prompt, but does not guarantee the actual effect.", + "to fix the camera to your prompt, but does not guarantee the actual effect.", optional=True, ), IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the video.", + tooltip='Whether to add an "AI generated" watermark to the video.', optional=True, ), ], @@ -879,13 +800,7 @@ class ByteDanceImageToVideoNode(IO.ComfyNode): validate_image_dimensions(image, min_width=300, min_height=300, max_width=6000, max_height=6000) validate_image_aspect_ratio_range(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - image_url = (await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=auth_kwargs))[0] - + image_url = (await upload_images_to_comfyapi(cls, image, max_images=1))[0] prompt = ( f"{prompt} " f"--resolution {resolution} " @@ -897,13 +812,11 @@ class ByteDanceImageToVideoNode(IO.ComfyNode): ) return await process_video_task( - request_model=Image2VideoTaskCreationRequest, + cls, payload=Image2VideoTaskCreationRequest( model=model, content=[TaskTextContent(text=prompt), TaskImageContent(image_url=TaskImageContentUrl(url=image_url))], ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))), ) @@ -971,13 +884,13 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode): "camera_fixed", default=False, tooltip="Specifies whether to fix the camera. The platform appends an instruction " - "to fix the camera to your prompt, but does not guarantee the actual effect.", + "to fix the camera to your prompt, but does not guarantee the actual effect.", optional=True, ), IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the video.", + tooltip='Whether to add an "AI generated" watermark to the video.', optional=True, ), ], @@ -1012,16 +925,11 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode): validate_image_dimensions(i, min_width=300, min_height=300, max_width=6000, max_height=6000) validate_image_aspect_ratio_range(i, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - download_urls = await upload_images_to_comfyapi( + cls, image_tensor_pair_to_batch(first_frame, last_frame), max_images=2, mime_type="image/png", - auth_kwargs=auth_kwargs, ) prompt = ( @@ -1035,7 +943,7 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode): ) return await process_video_task( - request_model=Image2VideoTaskCreationRequest, + cls, payload=Image2VideoTaskCreationRequest( model=model, content=[ @@ -1044,8 +952,6 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode): TaskImageContent(image_url=TaskImageContentUrl(url=str(download_urls[1])), role="last_frame"), ], ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))), ) @@ -1108,7 +1014,7 @@ class ByteDanceImageReferenceNode(IO.ComfyNode): IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the video.", + tooltip='Whether to add an "AI generated" watermark to the video.', optional=True, ), ], @@ -1141,15 +1047,7 @@ class ByteDanceImageReferenceNode(IO.ComfyNode): validate_image_dimensions(image, min_width=300, min_height=300, max_width=6000, max_height=6000) validate_image_aspect_ratio_range(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - image_urls = await upload_images_to_comfyapi( - images, max_images=4, mime_type="image/png", auth_kwargs=auth_kwargs - ) - + image_urls = await upload_images_to_comfyapi(cls, images, max_images=4, mime_type="image/png") prompt = ( f"{prompt} " f"--resolution {resolution} " @@ -1160,42 +1058,32 @@ class ByteDanceImageReferenceNode(IO.ComfyNode): ) x = [ TaskTextContent(text=prompt), - *[TaskImageContent(image_url=TaskImageContentUrl(url=str(i)), role="reference_image") for i in image_urls] + *[TaskImageContent(image_url=TaskImageContentUrl(url=str(i)), role="reference_image") for i in image_urls], ] return await process_video_task( - request_model=Image2VideoTaskCreationRequest, - payload=Image2VideoTaskCreationRequest( - model=model, - content=x, - ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, + cls, + payload=Image2VideoTaskCreationRequest(model=model, content=x), estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))), ) async def process_video_task( - request_model: Type[T], + cls: type[IO.ComfyNode], payload: Union[Text2VideoTaskCreationRequest, Image2VideoTaskCreationRequest], - auth_kwargs: dict, - node_id: str, estimated_duration: Optional[int], ) -> IO.NodeOutput: - initial_response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=BYTEPLUS_TASK_ENDPOINT, - method=HttpMethod.POST, - request_model=request_model, - response_model=TaskCreationResponse, - ), - request=payload, - auth_kwargs=auth_kwargs, - ).execute() - response = await poll_until_finished( - auth_kwargs, - initial_response.id, + initial_response = await sync_op( + cls, + ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"), + data=payload, + response_model=TaskCreationResponse, + ) + response = await poll_op( + cls, + ApiEndpoint(path=f"{BYTEPLUS_TASK_STATUS_ENDPOINT}/{initial_response.id}"), + status_extractor=lambda r: r.status, estimated_duration=estimated_duration, - node_id=node_id, + response_model=TaskStatusResponse, ) return IO.NodeOutput(await download_url_to_video_output(get_video_url_from_task_status(response))) @@ -1221,5 +1109,6 @@ class ByteDanceExtension(ComfyExtension): ByteDanceImageReferenceNode, ] + async def comfy_entrypoint() -> ByteDanceExtension: return ByteDanceExtension() diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index c1941cbe9..ca11b67ed 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -33,12 +33,9 @@ from comfy_api_nodes.apis.client import ( SynchronousOperation, ) from comfy_api_nodes.apinode_utils import ( - validate_string, - audio_to_base64_string, video_to_base64_string, - tensor_to_base64_string, - bytesio_to_image_tensor, ) +from comfy_api_nodes.util import validate_string, tensor_to_base64_string, bytesio_to_image_tensor, audio_to_base64_string from comfy_api.util import VideoContainer, VideoCodec diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 67c8307c5..eea65c9ac 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -5,8 +5,7 @@ For source of truth on the allowed permutations of request fields, please refere """ from __future__ import annotations -from typing import Optional, TypeVar, Any -from collections.abc import Callable +from typing import Optional, TypeVar import math import logging @@ -15,7 +14,6 @@ from typing_extensions import override import torch from comfy_api_nodes.apis import ( - KlingTaskStatus, KlingCameraControl, KlingCameraConfig, KlingCameraControlType, @@ -52,26 +50,20 @@ from comfy_api_nodes.apis import ( KlingCharacterEffectModelName, KlingSingleImageEffectModelName, ) -from comfy_api_nodes.apis.client import ( - ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( - tensor_to_base64_string, - download_url_to_video_output, - upload_video_to_comfyapi, - upload_audio_to_comfyapi, - download_url_to_image_tensor, - validate_string, -) -from comfy_api_nodes.util.validation_utils import ( +from comfy_api_nodes.util import ( validate_image_dimensions, validate_image_aspect_ratio, validate_video_dimensions, validate_video_duration, + tensor_to_base64_string, + validate_string, + upload_audio_to_comfyapi, + download_url_to_image_tensor, + upload_video_to_comfyapi, + download_url_to_video_output, + sync_op, + ApiEndpoint, + poll_op, ) from comfy_api.input_impl import VideoFromFile from comfy_api.input.basic_types import AudioInput @@ -214,34 +206,6 @@ VOICES_CONFIG = { } -async def poll_until_finished( - auth_kwargs: dict[str, str], - api_endpoint: ApiEndpoint[Any, R], - result_url_extractor: Optional[Callable[[R], str]] = None, - estimated_duration: Optional[int] = None, - node_id: Optional[str] = None, -) -> R: - """Polls the Kling API endpoint until the task reaches a terminal state, then returns the response.""" - return await PollingOperation( - poll_endpoint=api_endpoint, - completed_statuses=[ - KlingTaskStatus.succeed.value, - ], - failed_statuses=[KlingTaskStatus.failed.value], - status_extractor=lambda response: ( - response.data.task_status.value - if response.data and response.data.task_status - else None - ), - auth_kwargs=auth_kwargs, - result_url_extractor=result_url_extractor, - estimated_duration=estimated_duration, - node_id=node_id, - poll_interval=16.0, - max_poll_attempts=256, - ).execute() - - def is_valid_camera_control_configs(configs: list[float]) -> bool: """Verifies that at least one camera control configuration is non-zero.""" return any(not math.isclose(value, 0.0) for value in configs) @@ -377,8 +341,7 @@ async def image_result_to_node_output( async def execute_text2video( - auth_kwargs: dict[str, str], - node_id: str, + cls: type[IO.ComfyNode], prompt: str, negative_prompt: str, cfg_scale: float, @@ -389,14 +352,11 @@ async def execute_text2video( camera_control: Optional[KlingCameraControl] = None, ) -> IO.NodeOutput: validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V) - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_TEXT_TO_VIDEO, - method=HttpMethod.POST, - request_model=KlingText2VideoRequest, - response_model=KlingText2VideoResponse, - ), - request=KlingText2VideoRequest( + task_creation_response = await sync_op( + cls, + ApiEndpoint(path=PATH_TEXT_TO_VIDEO, method="POST"), + response_model=KlingText2VideoResponse, + data=KlingText2VideoRequest( prompt=prompt if prompt else None, negative_prompt=negative_prompt if negative_prompt else None, duration=KlingVideoGenDuration(duration), @@ -406,24 +366,17 @@ async def execute_text2video( aspect_ratio=KlingVideoGenAspectRatio(aspect_ratio), camera_control=camera_control, ), - auth_kwargs=auth_kwargs, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_TEXT_TO_VIDEO}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingText2VideoResponse, - ), - result_url_extractor=get_video_url_from_response, + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_TEXT_TO_VIDEO}/{task_id}"), + response_model=KlingText2VideoResponse, estimated_duration=AVERAGE_DURATION_T2V, - node_id=node_id, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), ) validate_video_result_response(final_response) @@ -432,8 +385,7 @@ async def execute_text2video( async def execute_image2video( - auth_kwargs: dict[str, str], - node_id: str, + cls: type[IO.ComfyNode], start_frame: torch.Tensor, prompt: str, negative_prompt: str, @@ -455,14 +407,11 @@ async def execute_image2video( if model_mode == "std" and model_name == KlingVideoGenModelName.kling_v2_5_turbo.value: model_mode = "pro" # October 5: currently "std" mode is not supported for this model - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_IMAGE_TO_VIDEO, - method=HttpMethod.POST, - request_model=KlingImage2VideoRequest, - response_model=KlingImage2VideoResponse, - ), - request=KlingImage2VideoRequest( + task_creation_response = await sync_op( + cls, + ApiEndpoint(path=PATH_IMAGE_TO_VIDEO, method="POST"), + response_model=KlingImage2VideoResponse, + data=KlingImage2VideoRequest( model_name=KlingVideoGenModelName(model_name), image=tensor_to_base64_string(start_frame), image_tail=( @@ -477,24 +426,17 @@ async def execute_image2video( duration=KlingVideoGenDuration(duration), camera_control=camera_control, ), - auth_kwargs=auth_kwargs, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}", - method=HttpMethod.GET, - request_model=KlingImage2VideoRequest, - response_model=KlingImage2VideoResponse, - ), - result_url_extractor=get_video_url_from_response, + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}"), + response_model=KlingImage2VideoResponse, estimated_duration=AVERAGE_DURATION_I2V, - node_id=node_id, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), ) validate_video_result_response(final_response) @@ -503,8 +445,7 @@ async def execute_image2video( async def execute_video_effect( - auth_kwargs: dict[str, str], - node_id: str, + cls: type[IO.ComfyNode], dual_character: bool, effect_scene: KlingDualCharacterEffectsScene | KlingSingleImageEffectsScene, model_name: str, @@ -530,35 +471,25 @@ async def execute_video_effect( duration=duration, ) - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_VIDEO_EFFECTS, - method=HttpMethod.POST, - request_model=KlingVideoEffectsRequest, - response_model=KlingVideoEffectsResponse, - ), - request=KlingVideoEffectsRequest( + task_creation_response = await sync_op( + cls, + endpoint=ApiEndpoint(path=PATH_VIDEO_EFFECTS, method="POST"), + response_model=KlingVideoEffectsResponse, + data=KlingVideoEffectsRequest( effect_scene=effect_scene, input=request_input_field, ), - auth_kwargs=auth_kwargs, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_VIDEO_EFFECTS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingVideoEffectsResponse, - ), - result_url_extractor=get_video_url_from_response, + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_VIDEO_EFFECTS}/{task_id}"), + response_model=KlingVideoEffectsResponse, estimated_duration=AVERAGE_DURATION_VIDEO_EFFECTS, - node_id=node_id, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), ) validate_video_result_response(final_response) @@ -567,8 +498,7 @@ async def execute_video_effect( async def execute_lipsync( - auth_kwargs: dict[str, str], - node_id: str, + cls: type[IO.ComfyNode], video: VideoInput, audio: Optional[AudioInput] = None, voice_language: Optional[str] = None, @@ -583,24 +513,21 @@ async def execute_lipsync( validate_video_duration(video, 2, 10) # Upload video to Comfy API and get download URL - video_url = await upload_video_to_comfyapi(video, auth_kwargs=auth_kwargs) + video_url = await upload_video_to_comfyapi(cls, video) logging.info("Uploaded video to Comfy API. URL: %s", video_url) # Upload the audio file to Comfy API and get download URL if audio: - audio_url = await upload_audio_to_comfyapi(audio, auth_kwargs=auth_kwargs) + audio_url = await upload_audio_to_comfyapi(cls, audio) logging.info("Uploaded audio to Comfy API. URL: %s", audio_url) else: audio_url = None - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_LIP_SYNC, - method=HttpMethod.POST, - request_model=KlingLipSyncRequest, - response_model=KlingLipSyncResponse, - ), - request=KlingLipSyncRequest( + task_creation_response = await sync_op( + cls, + ApiEndpoint(PATH_LIP_SYNC, "POST"), + response_model=KlingLipSyncResponse, + data=KlingLipSyncRequest( input=KlingLipSyncInputObject( video_url=video_url, mode=model_mode, @@ -612,24 +539,17 @@ async def execute_lipsync( voice_id=voice_id, ), ), - auth_kwargs=auth_kwargs, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_LIP_SYNC}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingLipSyncResponse, - ), - result_url_extractor=get_video_url_from_response, + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_LIP_SYNC}/{task_id}"), + response_model=KlingLipSyncResponse, estimated_duration=AVERAGE_DURATION_LIP_SYNC, - node_id=node_id, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), ) validate_video_result_response(final_response) @@ -807,11 +727,7 @@ class KlingTextToVideoNode(IO.ComfyNode): ) -> IO.NodeOutput: model_mode, duration, model_name = MODE_TEXT2VIDEO[mode] return await execute_text2video( - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, prompt=prompt, negative_prompt=negative_prompt, cfg_scale=cfg_scale, @@ -872,11 +788,7 @@ class KlingCameraControlT2VNode(IO.ComfyNode): camera_control: Optional[KlingCameraControl] = None, ) -> IO.NodeOutput: return await execute_text2video( - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, model_name=KlingVideoGenModelName.kling_v1, cfg_scale=cfg_scale, model_mode=KlingVideoGenMode.std, @@ -944,11 +856,7 @@ class KlingImage2VideoNode(IO.ComfyNode): end_frame: Optional[torch.Tensor] = None, ) -> IO.NodeOutput: return await execute_image2video( - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, start_frame=start_frame, prompt=prompt, negative_prompt=negative_prompt, @@ -1017,11 +925,7 @@ class KlingCameraControlI2VNode(IO.ComfyNode): camera_control: KlingCameraControl, ) -> IO.NodeOutput: return await execute_image2video( - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, model_name=KlingVideoGenModelName.kling_v1_5, start_frame=start_frame, cfg_scale=cfg_scale, @@ -1097,11 +1001,7 @@ class KlingStartEndFrameNode(IO.ComfyNode): ) -> IO.NodeOutput: mode, duration, model_name = MODE_START_END_FRAME[mode] return await execute_image2video( - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, prompt=prompt, negative_prompt=negative_prompt, model_name=model_name, @@ -1162,41 +1062,27 @@ class KlingVideoExtendNode(IO.ComfyNode): video_id: str, ) -> IO.NodeOutput: validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_VIDEO_EXTEND, - method=HttpMethod.POST, - request_model=KlingVideoExtendRequest, - response_model=KlingVideoExtendResponse, - ), - request=KlingVideoExtendRequest( + task_creation_response = await sync_op( + cls, + ApiEndpoint(path=PATH_VIDEO_EXTEND, method="POST"), + response_model=KlingVideoExtendResponse, + data=KlingVideoExtendRequest( prompt=prompt if prompt else None, negative_prompt=negative_prompt if negative_prompt else None, cfg_scale=cfg_scale, video_id=video_id, ), - auth_kwargs=auth, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await poll_until_finished( - auth, - ApiEndpoint( - path=f"{PATH_VIDEO_EXTEND}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingVideoExtendResponse, - ), - result_url_extractor=get_video_url_from_response, + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_VIDEO_EXTEND}/{task_id}"), + response_model=KlingVideoExtendResponse, estimated_duration=AVERAGE_DURATION_VIDEO_EXTEND, - node_id=cls.hidden.unique_id, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), ) validate_video_result_response(final_response) @@ -1259,11 +1145,7 @@ class KlingDualCharacterVideoEffectNode(IO.ComfyNode): duration: KlingVideoGenDuration, ) -> IO.NodeOutput: video, _, duration = await execute_video_effect( - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, dual_character=True, effect_scene=effect_scene, model_name=model_name, @@ -1324,11 +1206,7 @@ class KlingSingleImageVideoEffectNode(IO.ComfyNode): return IO.NodeOutput( *( await execute_video_effect( - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, dual_character=False, effect_scene=effect_scene, model_name=model_name, @@ -1379,11 +1257,7 @@ class KlingLipSyncAudioToVideoNode(IO.ComfyNode): voice_language: str, ) -> IO.NodeOutput: return await execute_lipsync( - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, video=video, audio=audio, voice_language=voice_language, @@ -1445,11 +1319,7 @@ class KlingLipSyncTextToVideoNode(IO.ComfyNode): ) -> IO.NodeOutput: voice_id, voice_language = VOICES_CONFIG[voice] return await execute_lipsync( - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, video=video, text=text, voice_language=voice_language, @@ -1496,40 +1366,26 @@ class KlingVirtualTryOnNode(IO.ComfyNode): cloth_image: torch.Tensor, model_name: KlingVirtualTryOnModelName, ) -> IO.NodeOutput: - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_VIRTUAL_TRY_ON, - method=HttpMethod.POST, - request_model=KlingVirtualTryOnRequest, - response_model=KlingVirtualTryOnResponse, - ), - request=KlingVirtualTryOnRequest( + task_creation_response = await sync_op( + cls, + ApiEndpoint(path=PATH_VIRTUAL_TRY_ON, method="POST"), + response_model=KlingVirtualTryOnResponse, + data=KlingVirtualTryOnRequest( human_image=tensor_to_base64_string(human_image), cloth_image=tensor_to_base64_string(cloth_image), model_name=model_name, ), - auth_kwargs=auth, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await poll_until_finished( - auth, - ApiEndpoint( - path=f"{PATH_VIRTUAL_TRY_ON}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingVirtualTryOnResponse, - ), - result_url_extractor=get_images_urls_from_response, + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_VIRTUAL_TRY_ON}/{task_id}"), + response_model=KlingVirtualTryOnResponse, estimated_duration=AVERAGE_DURATION_VIRTUAL_TRY_ON, - node_id=cls.hidden.unique_id, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), ) validate_image_result_response(final_response) @@ -1625,18 +1481,11 @@ class KlingImageGenerationNode(IO.ComfyNode): else: image = tensor_to_base64_string(image) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_IMAGE_GENERATIONS, - method=HttpMethod.POST, - request_model=KlingImageGenerationsRequest, - response_model=KlingImageGenerationsResponse, - ), - request=KlingImageGenerationsRequest( + task_creation_response = await sync_op( + cls, + ApiEndpoint(path=PATH_IMAGE_GENERATIONS, method="POST"), + response_model=KlingImageGenerationsResponse, + data=KlingImageGenerationsRequest( model_name=model_name, prompt=prompt, negative_prompt=negative_prompt, @@ -1647,24 +1496,17 @@ class KlingImageGenerationNode(IO.ComfyNode): n=n, aspect_ratio=aspect_ratio, ), - auth_kwargs=auth, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await poll_until_finished( - auth, - ApiEndpoint( - path=f"{PATH_IMAGE_GENERATIONS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingImageGenerationsResponse, - ), - result_url_extractor=get_images_urls_from_response, + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_IMAGE_GENERATIONS}/{task_id}"), + response_model=KlingImageGenerationsResponse, estimated_duration=AVERAGE_DURATION_IMAGE_GEN, - node_id=cls.hidden.unique_id, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), ) validate_image_result_response(final_response) diff --git a/comfy_api_nodes/nodes_luma.py b/comfy_api_nodes/nodes_luma.py index 610d95a77..e74441e5e 100644 --- a/comfy_api_nodes/nodes_luma.py +++ b/comfy_api_nodes/nodes_luma.py @@ -35,9 +35,9 @@ from comfy_api_nodes.apis.client import ( from comfy_api_nodes.apinode_utils import ( upload_images_to_comfyapi, process_image_response, - validate_string, ) from server import PromptServer +from comfy_api_nodes.util import validate_string import aiohttp import torch diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index 23be1ae65..e3722e79b 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -24,8 +24,8 @@ from comfy_api_nodes.apis.client import ( from comfy_api_nodes.apinode_utils import ( download_url_to_bytesio, upload_images_to_comfyapi, - validate_string, ) +from comfy_api_nodes.util import validate_string from server import PromptServer diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 7566188dd..7c31d95b3 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -1,35 +1,31 @@ import logging -from typing import Any, Callable, Optional, TypeVar +from typing import Optional + import torch from typing_extensions import override -from comfy_api_nodes.util.validation_utils import validate_image_dimensions +from comfy_api.input import VideoInput +from comfy_api.latest import IO, ComfyExtension from comfy_api_nodes.apis import ( - MoonvalleyTextToVideoRequest, + MoonvalleyPromptResponse, MoonvalleyTextToVideoInferenceParams, + MoonvalleyTextToVideoRequest, MoonvalleyVideoToVideoInferenceParams, MoonvalleyVideoToVideoRequest, - MoonvalleyPromptResponse, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( download_url_to_video_output, + poll_op, + sync_op, + trim_video, upload_images_to_comfyapi, upload_video_to_comfyapi, validate_container_format_is_mp4, + validate_image_dimensions, + validate_string, ) -from comfy_api.input import VideoInput -from comfy_api.latest import ComfyExtension, InputImpl, IO -import av -import io - API_UPLOADS_ENDPOINT = "/proxy/moonvalley/uploads" API_PROMPTS_ENDPOINT = "/proxy/moonvalley/prompts" API_VIDEO2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/video-to-video" @@ -51,13 +47,6 @@ MAX_VID_HEIGHT = 10000 MAX_VIDEO_SIZE = 1024 * 1024 * 1024 # 1 GB max for in-memory video processing MOONVALLEY_MAREY_MAX_PROMPT_LENGTH = 5000 -R = TypeVar("R") - - -class MoonvalleyApiError(Exception): - """Base exception for Moonvalley API errors.""" - - pass def is_valid_task_creation_response(response: MoonvalleyPromptResponse) -> bool: @@ -69,64 +58,7 @@ def validate_task_creation_response(response) -> None: if not is_valid_task_creation_response(response): error_msg = f"Moonvalley Marey API: Initial request failed. Code: {response.code}, Message: {response.message}, Data: {response}" logging.error(error_msg) - raise MoonvalleyApiError(error_msg) - - -def get_video_from_response(response): - video = response.output_url - logging.info( - "Moonvalley Marey API: Task %s succeeded. Video URL: %s", response.id, video - ) - return video - - -def get_video_url_from_response(response) -> Optional[str]: - """Returns the first video url from the Moonvalley video generation task result. - Will not raise an error if the response is not valid. - """ - if response: - return str(get_video_from_response(response)) - else: - return None - - -async def poll_until_finished( - auth_kwargs: dict[str, str], - api_endpoint: ApiEndpoint[Any, R], - result_url_extractor: Optional[Callable[[R], str]] = None, - node_id: Optional[str] = None, -) -> R: - """Polls the Moonvalley API endpoint until the task reaches a terminal state, then returns the response.""" - return await PollingOperation( - poll_endpoint=api_endpoint, - completed_statuses=[ - "completed", - ], - max_poll_attempts=240, # 64 minutes with 16s interval - poll_interval=16.0, - failed_statuses=["error"], - status_extractor=lambda response: ( - response.status if response and response.status else None - ), - auth_kwargs=auth_kwargs, - result_url_extractor=result_url_extractor, - node_id=node_id, - ).execute() - - -def validate_prompts( - prompt: str, negative_prompt: str, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH -): - """Verifies that the prompt isn't empty and that neither prompt is too long.""" - if not prompt: - raise ValueError("Positive prompt is empty") - if len(prompt) > max_length: - raise ValueError(f"Positive prompt is too long: {len(prompt)} characters") - if negative_prompt and len(negative_prompt) > max_length: - raise ValueError( - f"Negative prompt is too long: {len(negative_prompt)} characters" - ) - return True + raise RuntimeError(error_msg) def validate_video_to_video_input(video: VideoInput) -> VideoInput: @@ -170,12 +102,8 @@ def _validate_video_dimensions(width: int, height: int) -> None: } if (width, height) not in supported_resolutions: - supported_list = ", ".join( - [f"{w}x{h}" for w, h in sorted(supported_resolutions)] - ) - raise ValueError( - f"Resolution {width}x{height} not supported. Supported: {supported_list}" - ) + supported_list = ", ".join([f"{w}x{h}" for w, h in sorted(supported_resolutions)]) + raise ValueError(f"Resolution {width}x{height} not supported. Supported: {supported_list}") def _validate_and_trim_duration(video: VideoInput) -> VideoInput: @@ -188,7 +116,7 @@ def _validate_and_trim_duration(video: VideoInput) -> VideoInput: def _validate_minimum_duration(duration: float) -> None: """Ensures video is at least 5 seconds long.""" if duration < 5: - raise MoonvalleyApiError("Input video must be at least 5 seconds long.") + raise ValueError("Input video must be at least 5 seconds long.") def _trim_if_too_long(video: VideoInput, duration: float) -> VideoInput: @@ -198,123 +126,6 @@ def _trim_if_too_long(video: VideoInput, duration: float) -> VideoInput: return video -def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: - """ - Returns a new VideoInput object trimmed from the beginning to the specified duration, - using av to avoid loading entire video into memory. - - Args: - video: Input video to trim - duration_sec: Duration in seconds to keep from the beginning - - Returns: - VideoFromFile object that owns the output buffer - """ - output_buffer = io.BytesIO() - - input_container = None - output_container = None - - try: - # Get the stream source - this avoids loading entire video into memory - # when the source is already a file path - input_source = video.get_stream_source() - - # Open containers - input_container = av.open(input_source, mode="r") - output_container = av.open(output_buffer, mode="w", format="mp4") - - # Set up output streams for re-encoding - video_stream = None - audio_stream = None - - for stream in input_container.streams: - logging.info("Found stream: type=%s, class=%s", stream.type, type(stream)) - if isinstance(stream, av.VideoStream): - # Create output video stream with same parameters - video_stream = output_container.add_stream( - "h264", rate=stream.average_rate - ) - video_stream.width = stream.width - video_stream.height = stream.height - video_stream.pix_fmt = "yuv420p" - logging.info( - "Added video stream: %sx%s @ %sfps", stream.width, stream.height, stream.average_rate - ) - elif isinstance(stream, av.AudioStream): - # Create output audio stream with same parameters - audio_stream = output_container.add_stream( - "aac", rate=stream.sample_rate - ) - audio_stream.sample_rate = stream.sample_rate - audio_stream.layout = stream.layout - logging.info("Added audio stream: %sHz, %s channels", stream.sample_rate, stream.channels) - - # Calculate target frame count that's divisible by 16 - fps = input_container.streams.video[0].average_rate - estimated_frames = int(duration_sec * fps) - target_frames = ( - estimated_frames // 16 - ) * 16 # Round down to nearest multiple of 16 - - if target_frames == 0: - raise ValueError("Video too short: need at least 16 frames for Moonvalley") - - frame_count = 0 - audio_frame_count = 0 - - # Decode and re-encode video frames - if video_stream: - for frame in input_container.decode(video=0): - if frame_count >= target_frames: - break - - # Re-encode frame - for packet in video_stream.encode(frame): - output_container.mux(packet) - frame_count += 1 - - # Flush encoder - for packet in video_stream.encode(): - output_container.mux(packet) - - logging.info("Encoded %s video frames (target: %s)", frame_count, target_frames) - - # Decode and re-encode audio frames - if audio_stream: - input_container.seek(0) # Reset to beginning for audio - for frame in input_container.decode(audio=0): - if frame.time >= duration_sec: - break - - # Re-encode frame - for packet in audio_stream.encode(frame): - output_container.mux(packet) - audio_frame_count += 1 - - # Flush encoder - for packet in audio_stream.encode(): - output_container.mux(packet) - - logging.info("Encoded %s audio frames", audio_frame_count) - - # Close containers - output_container.close() - input_container.close() - - # Return as VideoFromFile using the buffer - output_buffer.seek(0) - return InputImpl.VideoFromFile(output_buffer) - - except Exception as e: - # Clean up on error - if input_container is not None: - input_container.close() - if output_container is not None: - output_container.close() - raise RuntimeError(f"Failed to trim video: {str(e)}") from e - - def parse_width_height_from_res(resolution: str): # Accepts a string like "16:9 (1920 x 1080)" and returns width, height as a dict res_map = { @@ -338,19 +149,14 @@ def parse_control_parameter(value): return control_map.get(value, control_map["Motion Transfer"]) -async def get_response( - task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None -) -> MoonvalleyPromptResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{API_PROMPTS_ENDPOINT}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MoonvalleyPromptResponse, - ), - result_url_extractor=get_video_url_from_response, - node_id=node_id, +async def get_response(cls: type[IO.ComfyNode], task_id: str) -> MoonvalleyPromptResponse: + return await poll_op( + cls, + ApiEndpoint(path=f"{API_PROMPTS_ENDPOINT}/{task_id}"), + response_model=MoonvalleyPromptResponse, + status_extractor=lambda r: (r.status if r and r.status else None), + poll_interval=16.0, + max_poll_attempts=240, ) @@ -444,14 +250,10 @@ class MoonvalleyImg2VideoNode(IO.ComfyNode): steps: int, ) -> IO.NodeOutput: validate_image_dimensions(image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH) - validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + validate_string(prompt, min_length=1, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + validate_string(negative_prompt, field_name="negative_prompt", max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = parse_width_height_from_res(resolution) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - inference_params = MoonvalleyTextToVideoInferenceParams( negative_prompt=negative_prompt, steps=steps, @@ -464,33 +266,17 @@ class MoonvalleyImg2VideoNode(IO.ComfyNode): # Get MIME type from tensor - assuming PNG format for image tensors mime_type = "image/png" - - image_url = ( - await upload_images_to_comfyapi( - image, max_images=1, auth_kwargs=auth, mime_type=mime_type - ) - )[0] - - request = MoonvalleyTextToVideoRequest( - image_url=image_url, prompt_text=prompt, inference_params=inference_params - ) - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=API_IMG2VIDEO_ENDPOINT, - method=HttpMethod.POST, - request_model=MoonvalleyTextToVideoRequest, - response_model=MoonvalleyPromptResponse, + image_url = (await upload_images_to_comfyapi(cls, image, max_images=1, mime_type=mime_type))[0] + task_creation_response = await sync_op( + cls, + endpoint=ApiEndpoint(path=API_IMG2VIDEO_ENDPOINT, method="POST"), + response_model=MoonvalleyPromptResponse, + data=MoonvalleyTextToVideoRequest( + image_url=image_url, prompt_text=prompt, inference_params=inference_params ), - request=request, - auth_kwargs=auth, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) - task_id = task_creation_response.id - - final_response = await get_response( - task_id, auth_kwargs=auth, node_id=cls.hidden.unique_id - ) + final_response = await get_response(cls, task_creation_response.id) video = await download_url_to_video_output(final_response.output_url) return IO.NodeOutput(video) @@ -582,15 +368,10 @@ class MoonvalleyVideo2VideoNode(IO.ComfyNode): steps=33, prompt_adherence=4.5, ) -> IO.NodeOutput: - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - validated_video = validate_video_to_video_input(video) - video_url = await upload_video_to_comfyapi(validated_video, auth_kwargs=auth) - - validate_prompts(prompt, negative_prompt) + video_url = await upload_video_to_comfyapi(cls, validated_video) + validate_string(prompt, min_length=1, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + validate_string(negative_prompt, field_name="negative_prompt", max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) # Only include motion_intensity for Motion Transfer control_params = {} @@ -605,35 +386,20 @@ class MoonvalleyVideo2VideoNode(IO.ComfyNode): guidance_scale=prompt_adherence, ) - control = parse_control_parameter(control_type) - - request = MoonvalleyVideoToVideoRequest( - control_type=control, - video_url=video_url, - prompt_text=prompt, - inference_params=inference_params, - ) - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=API_VIDEO2VIDEO_ENDPOINT, - method=HttpMethod.POST, - request_model=MoonvalleyVideoToVideoRequest, - response_model=MoonvalleyPromptResponse, + task_creation_response = await sync_op( + cls, + endpoint=ApiEndpoint(path=API_VIDEO2VIDEO_ENDPOINT, method="POST"), + response_model=MoonvalleyPromptResponse, + data=MoonvalleyVideoToVideoRequest( + control_type=parse_control_parameter(control_type), + video_url=video_url, + prompt_text=prompt, + inference_params=inference_params, ), - request=request, - auth_kwargs=auth, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) - task_id = task_creation_response.id - - final_response = await get_response( - task_id, auth_kwargs=auth, node_id=cls.hidden.unique_id - ) - - video = await download_url_to_video_output(final_response.output_url) - return IO.NodeOutput(video) + final_response = await get_response(cls, task_creation_response.id) + return IO.NodeOutput(await download_url_to_video_output(final_response.output_url)) class MoonvalleyTxt2VideoNode(IO.ComfyNode): @@ -720,14 +486,10 @@ class MoonvalleyTxt2VideoNode(IO.ComfyNode): seed: int, steps: int, ) -> IO.NodeOutput: - validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + validate_string(prompt, min_length=1, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + validate_string(negative_prompt, field_name="negative_prompt", max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = parse_width_height_from_res(resolution) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - inference_params = MoonvalleyTextToVideoInferenceParams( negative_prompt=negative_prompt, steps=steps, @@ -737,30 +499,16 @@ class MoonvalleyTxt2VideoNode(IO.ComfyNode): width=width_height["width"], height=width_height["height"], ) - request = MoonvalleyTextToVideoRequest( - prompt_text=prompt, inference_params=inference_params - ) - init_op = SynchronousOperation( - endpoint=ApiEndpoint( - path=API_TXT2VIDEO_ENDPOINT, - method=HttpMethod.POST, - request_model=MoonvalleyTextToVideoRequest, - response_model=MoonvalleyPromptResponse, - ), - request=request, - auth_kwargs=auth, + task_creation_response = await sync_op( + cls, + endpoint=ApiEndpoint(path=API_TXT2VIDEO_ENDPOINT, method="POST"), + response_model=MoonvalleyPromptResponse, + data=MoonvalleyTextToVideoRequest(prompt_text=prompt, inference_params=inference_params), ) - task_creation_response = await init_op.execute() validate_task_creation_response(task_creation_response) - task_id = task_creation_response.id - - final_response = await get_response( - task_id, auth_kwargs=auth, node_id=cls.hidden.unique_id - ) - - video = await download_url_to_video_output(final_response.output_url) - return IO.NodeOutput(video) + final_response = await get_response(cls, task_creation_response.id) + return IO.NodeOutput(await download_url_to_video_output(final_response.output_url)) class MoonvalleyExtension(ComfyExtension): diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index e3b81de75..c467e840c 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -43,13 +43,11 @@ from comfy_api_nodes.apis.client import ( ) from comfy_api_nodes.apinode_utils import ( - downscale_image_tensor, validate_and_cast_response, - validate_string, - tensor_to_base64_string, text_filepath_to_data_uri, ) from comfy_api_nodes.mapper_utils import model_field_to_node_input +from comfy_api_nodes.util import downscale_image_tensor, validate_string, tensor_to_base64_string RESPONSES_ENDPOINT = "/proxy/openai/v1/responses" diff --git a/comfy_api_nodes/nodes_pika.py b/comfy_api_nodes/nodes_pika.py index 27cb0067b..5bb406a3b 100644 --- a/comfy_api_nodes/nodes_pika.py +++ b/comfy_api_nodes/nodes_pika.py @@ -14,11 +14,6 @@ import torch from typing_extensions import override from comfy_api.latest import ComfyExtension, IO from comfy_api.input_impl.video_types import VideoCodec, VideoContainer, VideoInput -from comfy_api_nodes.apinode_utils import ( - download_url_to_video_output, - tensor_to_bytesio, - validate_string, -) from comfy_api_nodes.apis import pika_defs from comfy_api_nodes.apis.client import ( ApiEndpoint, @@ -27,6 +22,7 @@ from comfy_api_nodes.apis.client import ( PollingOperation, SynchronousOperation, ) +from comfy_api_nodes.util import validate_string, download_url_to_video_output, tensor_to_bytesio R = TypeVar("R") diff --git a/comfy_api_nodes/nodes_pixverse.py b/comfy_api_nodes/nodes_pixverse.py index 438a7f80b..b2b841be8 100644 --- a/comfy_api_nodes/nodes_pixverse.py +++ b/comfy_api_nodes/nodes_pixverse.py @@ -24,10 +24,7 @@ from comfy_api_nodes.apis.client import ( PollingOperation, EmptyRequest, ) -from comfy_api_nodes.apinode_utils import ( - tensor_to_bytesio, - validate_string, -) +from comfy_api_nodes.util import validate_string, tensor_to_bytesio from comfy_api.input_impl import VideoFromFile from comfy_api.latest import ComfyExtension, IO @@ -50,7 +47,6 @@ def get_video_url_from_response( async def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None): # first, upload image to Pixverse and get image id to use in actual generation call - files = {"image": tensor_to_bytesio(image)} operation = SynchronousOperation( endpoint=ApiEndpoint( path="/proxy/pixverse/image/upload", @@ -59,16 +55,14 @@ async def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None): response_model=PixverseImageUploadResponse, ), request=EmptyRequest(), - files=files, + files={"image": tensor_to_bytesio(image)}, content_type="multipart/form-data", auth_kwargs=auth_kwargs, ) response_upload: PixverseImageUploadResponse = await operation.execute() if response_upload.Resp is None: - raise Exception( - f"PixVerse image upload request failed: '{response_upload.ErrMsg}'" - ) + raise Exception(f"PixVerse image upload request failed: '{response_upload.ErrMsg}'") return response_upload.Resp.img_id @@ -95,7 +89,6 @@ class PixverseTemplateNode(IO.ComfyNode): template_id = pixverse_templates.get(template, None) if template_id is None: raise Exception(f"Template '{template}' is not recognized.") - # just return the integer return IO.NodeOutput(template_id) diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index 8beed5675..8ee7e55c4 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -24,12 +24,10 @@ from comfy_api_nodes.apis.client import ( EmptyRequest, ) from comfy_api_nodes.apinode_utils import ( - bytesio_to_image_tensor, download_url_to_bytesio, - tensor_to_bytesio, resize_mask_to_image, - validate_string, ) +from comfy_api_nodes.util import validate_string, tensor_to_bytesio, bytesio_to_image_tensor from server import PromptServer import torch diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py index eb03a897d..0543d1d0e 100644 --- a/comfy_api_nodes/nodes_runway.py +++ b/comfy_api_nodes/nodes_runway.py @@ -11,7 +11,7 @@ User Guides: """ -from typing import Union, Optional, Any +from typing import Union, Optional from typing_extensions import override from enum import Enum @@ -21,7 +21,6 @@ from comfy_api_nodes.apis import ( RunwayImageToVideoRequest, RunwayImageToVideoResponse, RunwayTaskStatusResponse as TaskStatusResponse, - RunwayTaskStatusEnum as TaskStatus, RunwayModelEnum as Model, RunwayDurationEnum as Duration, RunwayAspectRatioEnum as AspectRatio, @@ -33,23 +32,20 @@ from comfy_api_nodes.apis import ( ReferenceImage, RunwayTextToImageAspectRatioEnum, ) -from comfy_api_nodes.apis.client import ( - ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( - upload_images_to_comfyapi, - download_url_to_video_output, +from comfy_api_nodes.util import ( image_tensor_pair_to_batch, validate_string, + validate_image_dimensions, + validate_image_aspect_ratio, + upload_images_to_comfyapi, + download_url_to_video_output, download_url_to_image_tensor, + ApiEndpoint, + sync_op, + poll_op, ) from comfy_api.input_impl import VideoFromFile from comfy_api.latest import ComfyExtension, IO -from comfy_api_nodes.util.validation_utils import validate_image_dimensions, validate_image_aspect_ratio PATH_IMAGE_TO_VIDEO = "/proxy/runway/image_to_video" PATH_TEXT_TO_IMAGE = "/proxy/runway/text_to_image" @@ -91,31 +87,6 @@ def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, N return None -async def poll_until_finished( - auth_kwargs: dict[str, str], - api_endpoint: ApiEndpoint[Any, TaskStatusResponse], - estimated_duration: Optional[int] = None, - node_id: Optional[str] = None, -) -> TaskStatusResponse: - """Polls the Runway API endpoint until the task reaches a terminal state, then returns the response.""" - return await PollingOperation( - poll_endpoint=api_endpoint, - completed_statuses=[ - TaskStatus.SUCCEEDED.value, - ], - failed_statuses=[ - TaskStatus.FAILED.value, - TaskStatus.CANCELLED.value, - ], - status_extractor=lambda response: response.status.value, - auth_kwargs=auth_kwargs, - result_url_extractor=get_video_url_from_task_status, - estimated_duration=estimated_duration, - node_id=node_id, - progress_extractor=extract_progress_from_task_status, - ).execute() - - def extract_progress_from_task_status( response: TaskStatusResponse, ) -> Union[float, None]: @@ -132,42 +103,32 @@ def get_image_url_from_task_status(response: TaskStatusResponse) -> Union[str, N async def get_response( - task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None, estimated_duration: Optional[int] = None + cls: type[IO.ComfyNode], task_id: str, estimated_duration: Optional[int] = None ) -> TaskStatusResponse: """Poll the task status until it is finished then get the response.""" - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_GET_TASK_STATUS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TaskStatusResponse, - ), + return await poll_op( + cls, + ApiEndpoint(path=f"{PATH_GET_TASK_STATUS}/{task_id}"), + response_model=TaskStatusResponse, + status_extractor=lambda r: r.status.value, estimated_duration=estimated_duration, - node_id=node_id, + progress_extractor=extract_progress_from_task_status, ) async def generate_video( + cls: type[IO.ComfyNode], request: RunwayImageToVideoRequest, - auth_kwargs: dict[str, str], - node_id: Optional[str] = None, estimated_duration: Optional[int] = None, ) -> VideoFromFile: - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_IMAGE_TO_VIDEO, - method=HttpMethod.POST, - request_model=RunwayImageToVideoRequest, - response_model=RunwayImageToVideoResponse, - ), - request=request, - auth_kwargs=auth_kwargs, + initial_response = await sync_op( + cls, + endpoint=ApiEndpoint(path=PATH_IMAGE_TO_VIDEO, method="POST"), + response_model=RunwayImageToVideoResponse, + data=request, ) - initial_response = await initial_operation.execute() - - final_response = await get_response(initial_response.id, auth_kwargs, node_id, estimated_duration) + final_response = await get_response(cls, initial_response.id, estimated_duration) if not final_response.output: raise RunwayApiError("Runway task succeeded but no video data found in response.") @@ -184,9 +145,9 @@ class RunwayImageToVideoNodeGen3a(IO.ComfyNode): display_name="Runway Image to Video (Gen3a Turbo)", category="api node/video/Runway", description="Generate a video from a single starting frame using Gen3a Turbo model. " - "Before diving in, review these best practices to ensure that " - "your input selections will set your generation up for success: " - "https://help.runwayml.com/hc/en-us/articles/33927968552339-Creating-with-Act-One-on-Gen-3-Alpha-and-Turbo.", + "Before diving in, review these best practices to ensure that " + "your input selections will set your generation up for success: " + "https://help.runwayml.com/hc/en-us/articles/33927968552339-Creating-with-Act-One-on-Gen-3-Alpha-and-Turbo.", inputs=[ IO.String.Input( "prompt", @@ -241,20 +202,16 @@ class RunwayImageToVideoNodeGen3a(IO.ComfyNode): validate_image_dimensions(start_frame, max_width=7999, max_height=7999) validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - download_urls = await upload_images_to_comfyapi( + cls, start_frame, max_images=1, mime_type="image/png", - auth_kwargs=auth_kwargs, ) return IO.NodeOutput( await generate_video( + cls, RunwayImageToVideoRequest( promptText=prompt, seed=seed, @@ -262,15 +219,9 @@ class RunwayImageToVideoNodeGen3a(IO.ComfyNode): duration=Duration(duration), ratio=AspectRatio(ratio), promptImage=RunwayPromptImageObject( - root=[ - RunwayPromptImageDetailedObject( - uri=str(download_urls[0]), position="first" - ) - ] + root=[RunwayPromptImageDetailedObject(uri=str(download_urls[0]), position="first")] ), ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, ) ) @@ -284,9 +235,9 @@ class RunwayImageToVideoNodeGen4(IO.ComfyNode): display_name="Runway Image to Video (Gen4 Turbo)", category="api node/video/Runway", description="Generate a video from a single starting frame using Gen4 Turbo model. " - "Before diving in, review these best practices to ensure that " - "your input selections will set your generation up for success: " - "https://help.runwayml.com/hc/en-us/articles/37327109429011-Creating-with-Gen-4-Video.", + "Before diving in, review these best practices to ensure that " + "your input selections will set your generation up for success: " + "https://help.runwayml.com/hc/en-us/articles/37327109429011-Creating-with-Gen-4-Video.", inputs=[ IO.String.Input( "prompt", @@ -341,20 +292,16 @@ class RunwayImageToVideoNodeGen4(IO.ComfyNode): validate_image_dimensions(start_frame, max_width=7999, max_height=7999) validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - download_urls = await upload_images_to_comfyapi( + cls, start_frame, max_images=1, mime_type="image/png", - auth_kwargs=auth_kwargs, ) return IO.NodeOutput( await generate_video( + cls, RunwayImageToVideoRequest( promptText=prompt, seed=seed, @@ -362,15 +309,9 @@ class RunwayImageToVideoNodeGen4(IO.ComfyNode): duration=Duration(duration), ratio=AspectRatio(ratio), promptImage=RunwayPromptImageObject( - root=[ - RunwayPromptImageDetailedObject( - uri=str(download_urls[0]), position="first" - ) - ] + root=[RunwayPromptImageDetailedObject(uri=str(download_urls[0]), position="first")] ), ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, estimated_duration=AVERAGE_DURATION_FLF_SECONDS, ) ) @@ -385,12 +326,12 @@ class RunwayFirstLastFrameNode(IO.ComfyNode): display_name="Runway First-Last-Frame to Video", category="api node/video/Runway", description="Upload first and last keyframes, draft a prompt, and generate a video. " - "More complex transitions, such as cases where the Last frame is completely different " - "from the First frame, may benefit from the longer 10s duration. " - "This would give the generation more time to smoothly transition between the two inputs. " - "Before diving in, review these best practices to ensure that your input selections " - "will set your generation up for success: " - "https://help.runwayml.com/hc/en-us/articles/34170748696595-Creating-with-Keyframes-on-Gen-3.", + "More complex transitions, such as cases where the Last frame is completely different " + "from the First frame, may benefit from the longer 10s duration. " + "This would give the generation more time to smoothly transition between the two inputs. " + "Before diving in, review these best practices to ensure that your input selections " + "will set your generation up for success: " + "https://help.runwayml.com/hc/en-us/articles/34170748696595-Creating-with-Keyframes-on-Gen-3.", inputs=[ IO.String.Input( "prompt", @@ -452,23 +393,19 @@ class RunwayFirstLastFrameNode(IO.ComfyNode): validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) validate_image_aspect_ratio(end_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - stacked_input_images = image_tensor_pair_to_batch(start_frame, end_frame) download_urls = await upload_images_to_comfyapi( + cls, stacked_input_images, max_images=2, mime_type="image/png", - auth_kwargs=auth_kwargs, ) if len(download_urls) != 2: raise RunwayApiError("Failed to upload one or more images to comfy api.") return IO.NodeOutput( await generate_video( + cls, RunwayImageToVideoRequest( promptText=prompt, seed=seed, @@ -477,17 +414,11 @@ class RunwayFirstLastFrameNode(IO.ComfyNode): ratio=AspectRatio(ratio), promptImage=RunwayPromptImageObject( root=[ - RunwayPromptImageDetailedObject( - uri=str(download_urls[0]), position="first" - ), - RunwayPromptImageDetailedObject( - uri=str(download_urls[1]), position="last" - ), + RunwayPromptImageDetailedObject(uri=str(download_urls[0]), position="first"), + RunwayPromptImageDetailedObject(uri=str(download_urls[1]), position="last"), ] ), ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, estimated_duration=AVERAGE_DURATION_FLF_SECONDS, ) ) @@ -502,7 +433,7 @@ class RunwayTextToImageNode(IO.ComfyNode): display_name="Runway Text to Image", category="api node/image/Runway", description="Generate an image from a text prompt using Runway's Gen 4 model. " - "You can also include reference image to guide the generation.", + "You can also include reference image to guide the generation.", inputs=[ IO.String.Input( "prompt", @@ -540,49 +471,34 @@ class RunwayTextToImageNode(IO.ComfyNode): ) -> IO.NodeOutput: validate_string(prompt, min_length=1) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - # Prepare reference images if provided reference_images = None if reference_image is not None: validate_image_dimensions(reference_image, max_width=7999, max_height=7999) validate_image_aspect_ratio(reference_image, min_aspect_ratio=0.5, max_aspect_ratio=2.0) download_urls = await upload_images_to_comfyapi( + cls, reference_image, max_images=1, mime_type="image/png", - auth_kwargs=auth_kwargs, ) reference_images = [ReferenceImage(uri=str(download_urls[0]))] - request = RunwayTextToImageRequest( - promptText=prompt, - model=Model4.gen4_image, - ratio=ratio, - referenceImages=reference_images, - ) - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_TEXT_TO_IMAGE, - method=HttpMethod.POST, - request_model=RunwayTextToImageRequest, - response_model=RunwayTextToImageResponse, + initial_response = await sync_op( + cls, + endpoint=ApiEndpoint(path=PATH_TEXT_TO_IMAGE, method="POST"), + response_model=RunwayTextToImageResponse, + data=RunwayTextToImageRequest( + promptText=prompt, + model=Model4.gen4_image, + ratio=ratio, + referenceImages=reference_images, ), - request=request, - auth_kwargs=auth_kwargs, ) - initial_response = await initial_operation.execute() - - # Poll for completion final_response = await get_response( + cls, initial_response.id, - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, estimated_duration=AVERAGE_DURATION_T2I_SECONDS, ) if not final_response.output: @@ -601,5 +517,6 @@ class RunwayExtension(ComfyExtension): RunwayTextToImageNode, ] + async def comfy_entrypoint() -> RunwayExtension: return RunwayExtension() diff --git a/comfy_api_nodes/nodes_sora.py b/comfy_api_nodes/nodes_sora.py index efc954869..92b225d40 100644 --- a/comfy_api_nodes/nodes_sora.py +++ b/comfy_api_nodes/nodes_sora.py @@ -1,23 +1,20 @@ from typing import Optional -from typing_extensions import override import torch from pydantic import BaseModel, Field -from comfy_api.latest import ComfyExtension, IO -from comfy_api_nodes.apis.client import ( - ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.util.validation_utils import get_number_of_images +from typing_extensions import override -from comfy_api_nodes.apinode_utils import ( +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.util import ( + ApiEndpoint, download_url_to_video_output, + get_number_of_images, + poll_op, + sync_op, tensor_to_bytesio, ) + class Sora2GenerationRequest(BaseModel): prompt: str = Field(...) model: str = Field(...) @@ -80,7 +77,7 @@ class OpenAIVideoSora2(IO.ComfyNode): control_after_generate=True, optional=True, tooltip="Seed to determine if node should re-run; " - "actual results are nondeterministic regardless of seed.", + "actual results are nondeterministic regardless of seed.", ), ], outputs=[ @@ -111,55 +108,34 @@ class OpenAIVideoSora2(IO.ComfyNode): if get_number_of_images(image) != 1: raise ValueError("Currently only one input image is supported.") files_input = {"input_reference": ("image.png", tensor_to_bytesio(image), "image/png")} - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - payload = Sora2GenerationRequest( - model=model, - prompt=prompt, - seconds=str(duration), - size=size, - ) - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/openai/v1/videos", - method=HttpMethod.POST, - request_model=Sora2GenerationRequest, - response_model=Sora2GenerationResponse + initial_response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/openai/v1/videos", method="POST"), + data=Sora2GenerationRequest( + model=model, + prompt=prompt, + seconds=str(duration), + size=size, ), - request=payload, files=files_input, - auth_kwargs=auth, + response_model=Sora2GenerationResponse, content_type="multipart/form-data", ) - initial_response = await initial_operation.execute() if initial_response.error: - raise Exception(initial_response.error.message) + raise Exception(initial_response.error["message"]) model_time_multiplier = 1 if model == "sora-2" else 2 - poll_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/openai/v1/videos/{initial_response.id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=Sora2GenerationResponse - ), - completed_statuses=["completed"], - failed_statuses=["failed"], + await poll_op( + cls, + poll_endpoint=ApiEndpoint(path=f"/proxy/openai/v1/videos/{initial_response.id}"), + response_model=Sora2GenerationResponse, status_extractor=lambda x: x.status, - auth_kwargs=auth, poll_interval=8.0, max_poll_attempts=160, - node_id=cls.hidden.unique_id, - estimated_duration=45 * (duration / 4) * model_time_multiplier, + estimated_duration=int(45 * (duration / 4) * model_time_multiplier), ) - await poll_operation.execute() return IO.NodeOutput( - await download_url_to_video_output( - f"/proxy/openai/v1/videos/{initial_response.id}/content", - auth_kwargs=auth, - ) + await download_url_to_video_output(f"/proxy/openai/v1/videos/{initial_response.id}/content", cls=cls), ) diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index 8af03cfd1..783666ddf 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -27,14 +27,14 @@ from comfy_api_nodes.apis.client import ( PollingOperation, EmptyRequest, ) -from comfy_api_nodes.apinode_utils import ( +from comfy_api_nodes.util import ( + validate_audio_duration, + validate_string, + audio_input_to_mp3, bytesio_to_image_tensor, tensor_to_bytesio, - validate_string, audio_bytes_to_audio_input, - audio_input_to_mp3, ) -from comfy_api_nodes.util.validation_utils import validate_audio_duration import torch import base64 diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index daeaa823e..d37e9e9b4 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -1,28 +1,21 @@ -import logging import base64 -import aiohttp -import torch from io import BytesIO -from typing import Optional + from typing_extensions import override -from comfy_api.latest import ComfyExtension, IO from comfy_api.input_impl.video_types import VideoFromFile -from comfy_api_nodes.apis import ( - VeoGenVidRequest, - VeoGenVidResponse, +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.apis.veo_api import ( VeoGenVidPollRequest, VeoGenVidPollResponse, + VeoGenVidRequest, + VeoGenVidResponse, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, -) - -from comfy_api_nodes.apinode_utils import ( - downscale_image_tensor, + download_url_to_video_output, + poll_op, + sync_op, tensor_to_base64_string, ) @@ -35,28 +28,6 @@ MODELS_MAP = { "veo-3.0-fast-generate-001": "veo-3.0-fast-generate-001", } -def convert_image_to_base64(image: torch.Tensor): - if image is None: - return None - - scaled_image = downscale_image_tensor(image, total_pixels=2048*2048) - return tensor_to_base64_string(scaled_image) - - -def get_video_url_from_response(poll_response: VeoGenVidPollResponse) -> Optional[str]: - if ( - poll_response.response - and hasattr(poll_response.response, "videos") - and poll_response.response.videos - and len(poll_response.response.videos) > 0 - ): - video = poll_response.response.videos[0] - else: - return None - if hasattr(video, "gcsUri") and video.gcsUri: - return str(video.gcsUri) - return None - class VeoVideoGenerationNode(IO.ComfyNode): """ @@ -169,18 +140,13 @@ class VeoVideoGenerationNode(IO.ComfyNode): # Prepare the instances for the request instances = [] - instance = { - "prompt": prompt - } + instance = {"prompt": prompt} # Add image if provided if image is not None: - image_base64 = convert_image_to_base64(image) + image_base64 = tensor_to_base64_string(image) if image_base64: - instance["image"] = { - "bytesBase64Encoded": image_base64, - "mimeType": "image/png" - } + instance["image"] = {"bytesBase64Encoded": image_base64, "mimeType": "image/png"} instances.append(instance) @@ -198,119 +164,77 @@ class VeoVideoGenerationNode(IO.ComfyNode): if seed > 0: parameters["seed"] = seed # Only add generateAudio for Veo 3 models - if "veo-3.0" in model: + if model.find("veo-2.0") == -1: parameters["generateAudio"] = generate_audio - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - # Initial request to start video generation - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=f"/proxy/veo/{model}/generate", - method=HttpMethod.POST, - request_model=VeoGenVidRequest, - response_model=VeoGenVidResponse - ), - request=VeoGenVidRequest( + initial_response = await sync_op( + cls, + ApiEndpoint(path=f"/proxy/veo/{model}/generate", method="POST"), + response_model=VeoGenVidResponse, + data=VeoGenVidRequest( instances=instances, - parameters=parameters + parameters=parameters, ), - auth_kwargs=auth, ) - initial_response = await initial_operation.execute() - operation_name = initial_response.name - - logging.info("Veo generation started with operation name: %s", operation_name) - - # Define status extractor function def status_extractor(response): # Only return "completed" if the operation is done, regardless of success or failure # We'll check for errors after polling completes return "completed" if response.done else "pending" - # Define progress extractor function - def progress_extractor(response): - # Could be enhanced if the API provides progress information - return None - - # Define the polling operation - poll_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/veo/{model}/poll", - method=HttpMethod.POST, - request_model=VeoGenVidPollRequest, - response_model=VeoGenVidPollResponse - ), - completed_statuses=["completed"], - failed_statuses=[], # No failed statuses, we'll handle errors after polling + poll_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/veo/{model}/poll", method="POST"), + response_model=VeoGenVidPollResponse, status_extractor=status_extractor, - progress_extractor=progress_extractor, - request=VeoGenVidPollRequest( - operationName=operation_name + data=VeoGenVidPollRequest( + operationName=initial_response.name, ), - auth_kwargs=auth, poll_interval=5.0, - result_url_extractor=get_video_url_from_response, - node_id=cls.hidden.unique_id, estimated_duration=AVERAGE_DURATION_VIDEO_GEN, ) - # Execute the polling operation - poll_response = await poll_operation.execute() - # Now check for errors in the final response # Check for error in poll response - if hasattr(poll_response, 'error') and poll_response.error: - error_message = f"Veo API error: {poll_response.error.message} (code: {poll_response.error.code})" - logging.error(error_message) - raise Exception(error_message) + if poll_response.error: + raise Exception(f"Veo API error: {poll_response.error.message} (code: {poll_response.error.code})") # Check for RAI filtered content - if (hasattr(poll_response.response, 'raiMediaFilteredCount') and - poll_response.response.raiMediaFilteredCount > 0): + if ( + hasattr(poll_response.response, "raiMediaFilteredCount") + and poll_response.response.raiMediaFilteredCount > 0 + ): # Extract reason message if available - if (hasattr(poll_response.response, 'raiMediaFilteredReasons') and - poll_response.response.raiMediaFilteredReasons): + if ( + hasattr(poll_response.response, "raiMediaFilteredReasons") + and poll_response.response.raiMediaFilteredReasons + ): reason = poll_response.response.raiMediaFilteredReasons[0] error_message = f"Content filtered by Google's Responsible AI practices: {reason} ({poll_response.response.raiMediaFilteredCount} videos filtered.)" else: error_message = f"Content filtered by Google's Responsible AI practices ({poll_response.response.raiMediaFilteredCount} videos filtered.)" - logging.error(error_message) raise Exception(error_message) # Extract video data - if poll_response.response and hasattr(poll_response.response, 'videos') and poll_response.response.videos and len(poll_response.response.videos) > 0: + if ( + poll_response.response + and hasattr(poll_response.response, "videos") + and poll_response.response.videos + and len(poll_response.response.videos) > 0 + ): video = poll_response.response.videos[0] # Check if video is provided as base64 or URL - if hasattr(video, 'bytesBase64Encoded') and video.bytesBase64Encoded: - # Decode base64 string to bytes - video_data = base64.b64decode(video.bytesBase64Encoded) - elif hasattr(video, 'gcsUri') and video.gcsUri: - # Download from URL - async with aiohttp.ClientSession() as session: - async with session.get(video.gcsUri) as video_response: - video_data = await video_response.content.read() - else: - raise Exception("Video returned but no data or URL was provided") - else: - raise Exception("Video generation completed but no video was returned") + if hasattr(video, "bytesBase64Encoded") and video.bytesBase64Encoded: + return IO.NodeOutput(VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded)))) - if not video_data: - raise Exception("No video data was returned") + if hasattr(video, "gcsUri") and video.gcsUri: + return IO.NodeOutput(await download_url_to_video_output(video.gcsUri)) - logging.info("Video generation completed successfully") - - # Convert video data to BytesIO object - video_io = BytesIO(video_data) - - # Return VideoFromFile object - return IO.NodeOutput(VideoFromFile(video_io)) + raise Exception("Video returned but no data or URL was provided") + raise Exception("Video generation completed but no video was returned") class Veo3VideoGenerationNode(VeoVideoGenerationNode): @@ -394,7 +318,10 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): IO.Combo.Input( "model", options=[ - "veo-3.1-generate", "veo-3.1-fast-generate", "veo-3.0-generate-001", "veo-3.0-fast-generate-001" + "veo-3.1-generate", + "veo-3.1-fast-generate", + "veo-3.0-generate-001", + "veo-3.0-fast-generate-001", ], default="veo-3.0-generate-001", tooltip="Veo 3 model to use for video generation", @@ -427,5 +354,6 @@ class VeoExtension(ComfyExtension): Veo3VideoGenerationNode, ] + async def comfy_entrypoint() -> VeoExtension: return VeoExtension() diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py index 639be4b2b..0e0572f8c 100644 --- a/comfy_api_nodes/nodes_vidu.py +++ b/comfy_api_nodes/nodes_vidu.py @@ -1,27 +1,23 @@ import logging from enum import Enum -from typing import Any, Callable, Optional, Literal, TypeVar -from typing_extensions import override +from typing import Literal, Optional, TypeVar import torch from pydantic import BaseModel, Field +from typing_extensions import override -from comfy_api.latest import ComfyExtension, IO -from comfy_api_nodes.util.validation_utils import ( - validate_aspect_ratio_closeness, - validate_image_dimensions, - validate_image_aspect_ratio_range, - get_number_of_images, -) -from comfy_api_nodes.apis.client import ( +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, + download_url_to_video_output, + get_number_of_images, + poll_op, + sync_op, + upload_images_to_comfyapi, + validate_aspect_ratio_closeness, + validate_image_aspect_ratio_range, + validate_image_dimensions, ) -from comfy_api_nodes.apinode_utils import download_url_to_video_output, upload_images_to_comfyapi - VIDU_TEXT_TO_VIDEO = "/proxy/vidu/text2video" VIDU_IMAGE_TO_VIDEO = "/proxy/vidu/img2video" @@ -31,8 +27,9 @@ VIDU_GET_GENERATION_STATUS = "/proxy/vidu/tasks/%s/creations" R = TypeVar("R") + class VideoModelName(str, Enum): - vidu_q1 = 'viduq1' + vidu_q1 = "viduq1" class AspectRatio(str, Enum): @@ -63,17 +60,9 @@ class TaskCreationRequest(BaseModel): images: Optional[list[str]] = Field(None, description="Base64 encoded string or image URL") -class TaskStatus(str, Enum): - created = "created" - queueing = "queueing" - processing = "processing" - success = "success" - failed = "failed" - - class TaskCreationResponse(BaseModel): task_id: str = Field(...) - state: TaskStatus = Field(...) + state: str = Field(...) created_at: str = Field(...) code: Optional[int] = Field(None, description="Error code") @@ -85,32 +74,11 @@ class TaskResult(BaseModel): class TaskStatusResponse(BaseModel): - state: TaskStatus = Field(...) + state: str = Field(...) err_code: Optional[str] = Field(None) creations: list[TaskResult] = Field(..., description="Generated results") -async def poll_until_finished( - auth_kwargs: dict[str, str], - api_endpoint: ApiEndpoint[Any, R], - result_url_extractor: Optional[Callable[[R], str]] = None, - estimated_duration: Optional[int] = None, - node_id: Optional[str] = None, -) -> R: - return await PollingOperation( - poll_endpoint=api_endpoint, - completed_statuses=[TaskStatus.success.value], - failed_statuses=[TaskStatus.failed.value], - status_extractor=lambda response: response.state.value, - auth_kwargs=auth_kwargs, - result_url_extractor=result_url_extractor, - estimated_duration=estimated_duration, - node_id=node_id, - poll_interval=16.0, - max_poll_attempts=256, - ).execute() - - def get_video_url_from_response(response) -> Optional[str]: if response.creations: return response.creations[0].url @@ -127,37 +95,27 @@ def get_video_from_response(response) -> TaskResult: async def execute_task( + cls: type[IO.ComfyNode], vidu_endpoint: str, - auth_kwargs: Optional[dict[str, str]], payload: TaskCreationRequest, estimated_duration: int, - node_id: str, ) -> R: - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=vidu_endpoint, - method=HttpMethod.POST, - request_model=TaskCreationRequest, - response_model=TaskCreationResponse, - ), - request=payload, - auth_kwargs=auth_kwargs, - ).execute() - if response.state == TaskStatus.failed: + response = await sync_op( + cls, + endpoint=ApiEndpoint(path=vidu_endpoint, method="POST"), + response_model=TaskCreationResponse, + data=payload, + ) + if response.state == "failed": error_msg = f"Vidu request failed. Code: {response.code}" logging.error(error_msg) raise RuntimeError(error_msg) - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=VIDU_GET_GENERATION_STATUS % response.task_id, - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TaskStatusResponse, - ), - result_url_extractor=get_video_url_from_response, + return await poll_op( + cls, + ApiEndpoint(path=VIDU_GET_GENERATION_STATUS % response.task_id), + response_model=TaskStatusResponse, + status_extractor=lambda r: r.state.value, estimated_duration=estimated_duration, - node_id=node_id, ) @@ -258,11 +216,7 @@ class ViduTextToVideoNode(IO.ComfyNode): resolution=resolution, movement_amplitude=movement_amplitude, ) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - results = await execute_task(VIDU_TEXT_TO_VIDEO, auth, payload, 320, cls.hidden.unique_id) + results = await execute_task(cls, VIDU_TEXT_TO_VIDEO, payload, 320) return IO.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) @@ -362,17 +316,13 @@ class ViduImageToVideoNode(IO.ComfyNode): resolution=resolution, movement_amplitude=movement_amplitude, ) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } payload.images = await upload_images_to_comfyapi( + cls, image, max_images=1, mime_type="image/png", - auth_kwargs=auth, ) - results = await execute_task(VIDU_IMAGE_TO_VIDEO, auth, payload, 120, cls.hidden.unique_id) + results = await execute_task(cls, VIDU_IMAGE_TO_VIDEO, payload, 120) return IO.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) @@ -484,17 +434,13 @@ class ViduReferenceVideoNode(IO.ComfyNode): resolution=resolution, movement_amplitude=movement_amplitude, ) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } payload.images = await upload_images_to_comfyapi( + cls, images, max_images=7, mime_type="image/png", - auth_kwargs=auth, ) - results = await execute_task(VIDU_REFERENCE_VIDEO, auth, payload, 120, cls.hidden.unique_id) + results = await execute_task(cls, VIDU_REFERENCE_VIDEO, payload, 120) return IO.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) @@ -596,15 +542,11 @@ class ViduStartEndToVideoNode(IO.ComfyNode): resolution=resolution, movement_amplitude=movement_amplitude, ) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } payload.images = [ - (await upload_images_to_comfyapi(frame, max_images=1, mime_type="image/png", auth_kwargs=auth))[0] + (await upload_images_to_comfyapi(cls, frame, max_images=1, mime_type="image/png"))[0] for frame in (first_frame, end_frame) ] - results = await execute_task(VIDU_START_END_VIDEO, auth, payload, 96, cls.hidden.unique_id) + results = await execute_task(cls, VIDU_START_END_VIDEO, payload, 96) return IO.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) @@ -618,5 +560,6 @@ class ViduExtension(ComfyExtension): ViduStartEndToVideoNode, ] + async def comfy_entrypoint() -> ViduExtension: return ViduExtension() diff --git a/comfy_api_nodes/nodes_wan.py b/comfy_api_nodes/nodes_wan.py index b089bd907..2aab3c2ff 100644 --- a/comfy_api_nodes/nodes_wan.py +++ b/comfy_api_nodes/nodes_wan.py @@ -1,28 +1,24 @@ import re -from typing import Optional, Type, Union -from typing_extensions import override +from typing import Optional import torch from pydantic import BaseModel, Field -from comfy_api.latest import ComfyExtension, Input, IO -from comfy_api_nodes.apis.client import ( - ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, - R, - T, -) -from comfy_api_nodes.util.validation_utils import get_number_of_images, validate_audio_duration +from typing_extensions import override -from comfy_api_nodes.apinode_utils import ( +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api_nodes.util import ( + ApiEndpoint, + audio_to_base64_string, download_url_to_image_tensor, download_url_to_video_output, + get_number_of_images, + poll_op, + sync_op, tensor_to_base64_string, - audio_to_base64_string, + validate_audio_duration, ) + class Text2ImageInputField(BaseModel): prompt: str = Field(...) negative_prompt: Optional[str] = Field(None) @@ -146,53 +142,7 @@ class VideoTaskStatusResponse(BaseModel): request_id: str = Field(...) -RES_IN_PARENS = re.compile(r'\((\d+)\s*[x×]\s*(\d+)\)') - - -async def process_task( - auth_kwargs: dict[str, str], - url: str, - request_model: Type[T], - response_model: Type[R], - payload: Union[ - Text2ImageTaskCreationRequest, - Image2ImageTaskCreationRequest, - Text2VideoTaskCreationRequest, - Image2VideoTaskCreationRequest, - ], - node_id: str, - estimated_duration: int, - poll_interval: int, -) -> Type[R]: - initial_response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=url, - method=HttpMethod.POST, - request_model=request_model, - response_model=TaskCreationResponse, - ), - request=payload, - auth_kwargs=auth_kwargs, - ).execute() - - if not initial_response.output: - raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") - - return await PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=response_model, - ), - completed_statuses=["SUCCEEDED"], - failed_statuses=["FAILED", "CANCELED", "UNKNOWN"], - status_extractor=lambda x: x.output.task_status, - estimated_duration=estimated_duration, - poll_interval=poll_interval, - node_id=node_id, - auth_kwargs=auth_kwargs, - ).execute() +RES_IN_PARENS = re.compile(r"\((\d+)\s*[x×]\s*(\d+)\)") class WanTextToImageApi(IO.ComfyNode): @@ -259,7 +209,7 @@ class WanTextToImageApi(IO.ComfyNode): IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the result.", + tooltip='Whether to add an "AI generated" watermark to the result.', optional=True, ), ], @@ -286,26 +236,28 @@ class WanTextToImageApi(IO.ComfyNode): prompt_extend: bool = True, watermark: bool = True, ): - payload = Text2ImageTaskCreationRequest( - model=model, - input=Text2ImageInputField(prompt=prompt, negative_prompt=negative_prompt), - parameters=Txt2ImageParametersField( - size=f"{width}*{height}", - seed=seed, - prompt_extend=prompt_extend, - watermark=watermark, + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/wan/api/v1/services/aigc/text2image/image-synthesis", method="POST"), + response_model=TaskCreationResponse, + data=Text2ImageTaskCreationRequest( + model=model, + input=Text2ImageInputField(prompt=prompt, negative_prompt=negative_prompt), + parameters=Txt2ImageParametersField( + size=f"{width}*{height}", + seed=seed, + prompt_extend=prompt_extend, + watermark=watermark, + ), ), ) - response = await process_task( - { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - "/proxy/wan/api/v1/services/aigc/text2image/image-synthesis", - request_model=Text2ImageTaskCreationRequest, + if not initial_response.output: + raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), response_model=ImageTaskStatusResponse, - payload=payload, - node_id=cls.hidden.unique_id, + status_extractor=lambda x: x.output.task_status, estimated_duration=9, poll_interval=3, ) @@ -320,7 +272,7 @@ class WanImageToImageApi(IO.ComfyNode): display_name="Wan Image to Image", category="api node/image/Wan", description="Generates an image from one or two input images and a text prompt. " - "The output image is currently fixed at 1.6 MP; its aspect ratio matches the input image(s).", + "The output image is currently fixed at 1.6 MP; its aspect ratio matches the input image(s).", inputs=[ IO.Combo.Input( "model", @@ -376,7 +328,7 @@ class WanImageToImageApi(IO.ComfyNode): IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the result.", + tooltip='Whether to add an "AI generated" watermark to the result.', optional=True, ), ], @@ -408,28 +360,30 @@ class WanImageToImageApi(IO.ComfyNode): raise ValueError(f"Expected 1 or 2 input images, got {n_images}.") images = [] for i in image: - images.append("data:image/png;base64," + tensor_to_base64_string(i, total_pixels=4096*4096)) - payload = Image2ImageTaskCreationRequest( - model=model, - input=Image2ImageInputField(prompt=prompt, negative_prompt=negative_prompt, images=images), - parameters=Image2ImageParametersField( - # size=f"{width}*{height}", - seed=seed, - watermark=watermark, + images.append("data:image/png;base64," + tensor_to_base64_string(i, total_pixels=4096 * 4096)) + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/wan/api/v1/services/aigc/image2image/image-synthesis", method="POST"), + response_model=TaskCreationResponse, + data=Image2ImageTaskCreationRequest( + model=model, + input=Image2ImageInputField(prompt=prompt, negative_prompt=negative_prompt, images=images), + parameters=Image2ImageParametersField( + # size=f"{width}*{height}", + seed=seed, + watermark=watermark, + ), ), ) - response = await process_task( - { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - "/proxy/wan/api/v1/services/aigc/image2image/image-synthesis", - request_model=Image2ImageTaskCreationRequest, + if not initial_response.output: + raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), response_model=ImageTaskStatusResponse, - payload=payload, - node_id=cls.hidden.unique_id, + status_extractor=lambda x: x.output.task_status, estimated_duration=42, - poll_interval=3, + poll_interval=4, ) return IO.NodeOutput(await download_url_to_image_tensor(str(response.output.results[0].url))) @@ -523,7 +477,7 @@ class WanTextToVideoApi(IO.ComfyNode): IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the result.", + tooltip='Whether to add an "AI generated" watermark to the result.', optional=True, ), ], @@ -557,28 +511,31 @@ class WanTextToVideoApi(IO.ComfyNode): if audio is not None: validate_audio_duration(audio, 3.0, 29.0) audio_url = "data:audio/mp3;base64," + audio_to_base64_string(audio, "mp3", "libmp3lame") - payload = Text2VideoTaskCreationRequest( - model=model, - input=Text2VideoInputField(prompt=prompt, negative_prompt=negative_prompt, audio_url=audio_url), - parameters=Text2VideoParametersField( - size=f"{width}*{height}", - duration=duration, - seed=seed, - audio=generate_audio, - prompt_extend=prompt_extend, - watermark=watermark, + + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", method="POST"), + response_model=TaskCreationResponse, + data=Text2VideoTaskCreationRequest( + model=model, + input=Text2VideoInputField(prompt=prompt, negative_prompt=negative_prompt, audio_url=audio_url), + parameters=Text2VideoParametersField( + size=f"{width}*{height}", + duration=duration, + seed=seed, + audio=generate_audio, + prompt_extend=prompt_extend, + watermark=watermark, + ), ), ) - response = await process_task( - { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - "/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", - request_model=Text2VideoTaskCreationRequest, + if not initial_response.output: + raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), response_model=VideoTaskStatusResponse, - payload=payload, - node_id=cls.hidden.unique_id, + status_extractor=lambda x: x.output.task_status, estimated_duration=120 * int(duration / 5), poll_interval=6, ) @@ -667,7 +624,7 @@ class WanImageToVideoApi(IO.ComfyNode): IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the result.", + tooltip='Whether to add an "AI generated" watermark to the result.', optional=True, ), ], @@ -699,35 +656,37 @@ class WanImageToVideoApi(IO.ComfyNode): ): if get_number_of_images(image) != 1: raise ValueError("Exactly one input image is required.") - image_url = "data:image/png;base64," + tensor_to_base64_string(image, total_pixels=2000*2000) + image_url = "data:image/png;base64," + tensor_to_base64_string(image, total_pixels=2000 * 2000) audio_url = None if audio is not None: validate_audio_duration(audio, 3.0, 29.0) audio_url = "data:audio/mp3;base64," + audio_to_base64_string(audio, "mp3", "libmp3lame") - payload = Image2VideoTaskCreationRequest( - model=model, - input=Image2VideoInputField( - prompt=prompt, negative_prompt=negative_prompt, img_url=image_url, audio_url=audio_url - ), - parameters=Image2VideoParametersField( - resolution=resolution, - duration=duration, - seed=seed, - audio=generate_audio, - prompt_extend=prompt_extend, - watermark=watermark, + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", method="POST"), + response_model=TaskCreationResponse, + data=Image2VideoTaskCreationRequest( + model=model, + input=Image2VideoInputField( + prompt=prompt, negative_prompt=negative_prompt, img_url=image_url, audio_url=audio_url + ), + parameters=Image2VideoParametersField( + resolution=resolution, + duration=duration, + seed=seed, + audio=generate_audio, + prompt_extend=prompt_extend, + watermark=watermark, + ), ), ) - response = await process_task( - { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - "/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", - request_model=Image2VideoTaskCreationRequest, + if not initial_response.output: + raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), response_model=VideoTaskStatusResponse, - payload=payload, - node_id=cls.hidden.unique_id, + status_extractor=lambda x: x.output.task_status, estimated_duration=120 * int(duration / 5), poll_interval=6, ) diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py index e69de29bb..c2ec391aa 100644 --- a/comfy_api_nodes/util/__init__.py +++ b/comfy_api_nodes/util/__init__.py @@ -0,0 +1,87 @@ +from ._helpers import get_fs_object_size +from .client import ( + ApiEndpoint, + poll_op, + poll_op_raw, + sync_op, + sync_op_raw, +) +from .conversions import ( + audio_bytes_to_audio_input, + audio_input_to_mp3, + audio_to_base64_string, + bytesio_to_image_tensor, + downscale_image_tensor, + image_tensor_pair_to_batch, + pil_to_bytesio, + tensor_to_base64_string, + tensor_to_bytesio, + tensor_to_pil, + trim_video, +) +from .download_helpers import ( + download_url_to_bytesio, + download_url_to_image_tensor, + download_url_to_video_output, +) +from .upload_helpers import ( + upload_audio_to_comfyapi, + upload_file_to_comfyapi, + upload_images_to_comfyapi, + upload_video_to_comfyapi, +) +from .validation_utils import ( + get_number_of_images, + validate_aspect_ratio_closeness, + validate_audio_duration, + validate_container_format_is_mp4, + validate_image_aspect_ratio, + validate_image_aspect_ratio_range, + validate_image_dimensions, + validate_string, + validate_video_dimensions, + validate_video_duration, +) + +__all__ = [ + # API client + "ApiEndpoint", + "poll_op", + "poll_op_raw", + "sync_op", + "sync_op_raw", + # Upload helpers + "upload_audio_to_comfyapi", + "upload_file_to_comfyapi", + "upload_images_to_comfyapi", + "upload_video_to_comfyapi", + # Download helpers + "download_url_to_bytesio", + "download_url_to_image_tensor", + "download_url_to_video_output", + # Conversions + "audio_bytes_to_audio_input", + "audio_input_to_mp3", + "audio_to_base64_string", + "bytesio_to_image_tensor", + "downscale_image_tensor", + "image_tensor_pair_to_batch", + "pil_to_bytesio", + "tensor_to_base64_string", + "tensor_to_bytesio", + "tensor_to_pil", + "trim_video", + # Validation utilities + "get_number_of_images", + "validate_aspect_ratio_closeness", + "validate_audio_duration", + "validate_container_format_is_mp4", + "validate_image_aspect_ratio", + "validate_image_aspect_ratio_range", + "validate_image_dimensions", + "validate_string", + "validate_video_dimensions", + "validate_video_duration", + # Misc functions + "get_fs_object_size", +] diff --git a/comfy_api_nodes/util/_helpers.py b/comfy_api_nodes/util/_helpers.py new file mode 100644 index 000000000..328fe5227 --- /dev/null +++ b/comfy_api_nodes/util/_helpers.py @@ -0,0 +1,71 @@ +import asyncio +import contextlib +import os +import time +from io import BytesIO +from typing import Callable, Optional, Union + +from comfy.cli_args import args +from comfy.model_management import processing_interrupted +from comfy_api.latest import IO + +from .common_exceptions import ProcessingInterrupted + + +def is_processing_interrupted() -> bool: + """Return True if user/runtime requested interruption.""" + return processing_interrupted() + + +def get_node_id(node_cls: type[IO.ComfyNode]) -> str: + return node_cls.hidden.unique_id + + +def get_auth_header(node_cls: type[IO.ComfyNode]) -> dict[str, str]: + if node_cls.hidden.auth_token_comfy_org: + return {"Authorization": f"Bearer {node_cls.hidden.auth_token_comfy_org}"} + if node_cls.hidden.api_key_comfy_org: + return {"X-API-KEY": node_cls.hidden.api_key_comfy_org} + return {} + + +def default_base_url() -> str: + return getattr(args, "comfy_api_base", "https://api.comfy.org") + + +async def sleep_with_interrupt( + seconds: float, + node_cls: Optional[type[IO.ComfyNode]], + label: Optional[str] = None, + start_ts: Optional[float] = None, + estimated_total: Optional[int] = None, + *, + display_callback: Optional[Callable[[type[IO.ComfyNode], str, int, Optional[int]], None]] = None, +): + """ + Sleep in 1s slices while: + - Checking for interruption (raises ProcessingInterrupted). + - Optionally emitting time progress via display_callback (if provided). + """ + end = time.monotonic() + seconds + while True: + if is_processing_interrupted(): + raise ProcessingInterrupted("Task cancelled") + now = time.monotonic() + if start_ts is not None and label and display_callback: + with contextlib.suppress(Exception): + display_callback(node_cls, label, int(now - start_ts), estimated_total) + if now >= end: + break + await asyncio.sleep(min(1.0, end - now)) + + +def mimetype_to_extension(mime_type: str) -> str: + """Converts a MIME type to a file extension.""" + return mime_type.split("/")[-1].lower() + + +def get_fs_object_size(path_or_object: Union[str, BytesIO]) -> int: + if isinstance(path_or_object, str): + return os.path.getsize(path_or_object) + return len(path_or_object.getvalue()) diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py new file mode 100644 index 000000000..5833b118f --- /dev/null +++ b/comfy_api_nodes/util/client.py @@ -0,0 +1,941 @@ +import asyncio +import contextlib +import json +import logging +import socket +import time +import uuid +from dataclasses import dataclass +from enum import Enum +from io import BytesIO +from typing import Any, Callable, Iterable, Literal, Optional, Type, TypeVar, Union +from urllib.parse import urljoin, urlparse + +import aiohttp +from aiohttp.client_exceptions import ClientError, ContentTypeError +from pydantic import BaseModel + +from comfy import utils +from comfy_api.latest import IO +from comfy_api_nodes.apis import request_logger +from server import PromptServer + +from ._helpers import ( + default_base_url, + get_auth_header, + get_node_id, + is_processing_interrupted, + sleep_with_interrupt, +) +from .common_exceptions import ApiServerError, LocalNetworkError, ProcessingInterrupted + +M = TypeVar("M", bound=BaseModel) + + +class ApiEndpoint: + def __init__( + self, + path: str, + method: Literal["GET", "POST", "PUT", "DELETE", "PATCH"] = "GET", + *, + query_params: Optional[dict[str, Any]] = None, + headers: Optional[dict[str, str]] = None, + ): + self.path = path + self.method = method + self.query_params = query_params or {} + self.headers = headers or {} + + +@dataclass +class _RequestConfig: + node_cls: type[IO.ComfyNode] + endpoint: ApiEndpoint + timeout: float + content_type: str + data: Optional[dict[str, Any]] + files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] + multipart_parser: Optional[Callable] + max_retries: int + retry_delay: float + retry_backoff: float + wait_label: str = "Waiting" + monitor_progress: bool = True + estimated_total: Optional[int] = None + final_label_on_success: Optional[str] = "Completed" + progress_origin_ts: Optional[float] = None + + +@dataclass +class _PollUIState: + started: float + status_label: str = "Queued" + is_queued: bool = True + price: Optional[float] = None + estimated_duration: Optional[int] = None + base_processing_elapsed: float = 0.0 # sum of completed active intervals + active_since: Optional[float] = None # start time of current active interval (None if queued) + + +_RETRY_STATUS = {408, 429, 500, 502, 503, 504} +COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed"] +FAILED_STATUSES = ["cancelled", "canceled", "failed", "error"] +QUEUED_STATUSES = ["created", "queued", "queueing", "submitted"] + + +async def sync_op( + cls: type[IO.ComfyNode], + endpoint: ApiEndpoint, + *, + response_model: Type[M], + data: Optional[BaseModel] = None, + files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None, + content_type: str = "application/json", + timeout: float = 3600.0, + multipart_parser: Optional[Callable] = None, + max_retries: int = 3, + retry_delay: float = 1.0, + retry_backoff: float = 2.0, + wait_label: str = "Waiting for server", + estimated_duration: Optional[int] = None, + final_label_on_success: Optional[str] = "Completed", + progress_origin_ts: Optional[float] = None, + monitor_progress: bool = True, +) -> M: + raw = await sync_op_raw( + cls, + endpoint, + data=data, + files=files, + content_type=content_type, + timeout=timeout, + multipart_parser=multipart_parser, + max_retries=max_retries, + retry_delay=retry_delay, + retry_backoff=retry_backoff, + wait_label=wait_label, + estimated_duration=estimated_duration, + as_binary=False, + final_label_on_success=final_label_on_success, + progress_origin_ts=progress_origin_ts, + monitor_progress=monitor_progress, + ) + if not isinstance(raw, dict): + raise Exception("Expected JSON response to validate into a Pydantic model, got non-JSON (binary or text).") + return _validate_or_raise(response_model, raw) + + +async def poll_op( + cls: type[IO.ComfyNode], + poll_endpoint: ApiEndpoint, + *, + response_model: Type[M], + status_extractor: Callable[[M], Optional[Union[str, int]]], + progress_extractor: Optional[Callable[[M], Optional[int]]] = None, + price_extractor: Optional[Callable[[M], Optional[float]]] = None, + completed_statuses: Optional[list[Union[str, int]]] = None, + failed_statuses: Optional[list[Union[str, int]]] = None, + queued_statuses: Optional[list[Union[str, int]]] = None, + data: Optional[BaseModel] = None, + poll_interval: float = 5.0, + max_poll_attempts: int = 120, + timeout_per_poll: float = 120.0, + max_retries_per_poll: int = 3, + retry_delay_per_poll: float = 1.0, + retry_backoff_per_poll: float = 2.0, + estimated_duration: Optional[int] = None, + cancel_endpoint: Optional[ApiEndpoint] = None, + cancel_timeout: float = 10.0, +) -> M: + raw = await poll_op_raw( + cls, + poll_endpoint=poll_endpoint, + status_extractor=_wrap_model_extractor(response_model, status_extractor), + progress_extractor=_wrap_model_extractor(response_model, progress_extractor), + price_extractor=_wrap_model_extractor(response_model, price_extractor), + completed_statuses=completed_statuses, + failed_statuses=failed_statuses, + queued_statuses=queued_statuses, + data=data, + poll_interval=poll_interval, + max_poll_attempts=max_poll_attempts, + timeout_per_poll=timeout_per_poll, + max_retries_per_poll=max_retries_per_poll, + retry_delay_per_poll=retry_delay_per_poll, + retry_backoff_per_poll=retry_backoff_per_poll, + estimated_duration=estimated_duration, + cancel_endpoint=cancel_endpoint, + cancel_timeout=cancel_timeout, + ) + if not isinstance(raw, dict): + raise Exception("Expected JSON response to validate into a Pydantic model, got non-JSON (binary or text).") + return _validate_or_raise(response_model, raw) + + +async def sync_op_raw( + cls: type[IO.ComfyNode], + endpoint: ApiEndpoint, + *, + data: Optional[Union[dict[str, Any], BaseModel]] = None, + files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None, + content_type: str = "application/json", + timeout: float = 3600.0, + multipart_parser: Optional[Callable] = None, + max_retries: int = 3, + retry_delay: float = 1.0, + retry_backoff: float = 2.0, + wait_label: str = "Waiting for server", + estimated_duration: Optional[int] = None, + as_binary: bool = False, + final_label_on_success: Optional[str] = "Completed", + progress_origin_ts: Optional[float] = None, + monitor_progress: bool = True, +) -> Union[dict[str, Any], bytes]: + """ + Make a single network request. + - If as_binary=False (default): returns JSON dict (or {'_raw': ''} if non-JSON). + - If as_binary=True: returns bytes. + """ + if isinstance(data, BaseModel): + data = data.model_dump(exclude_none=True) + for k, v in list(data.items()): + if isinstance(v, Enum): + data[k] = v.value + cfg = _RequestConfig( + node_cls=cls, + endpoint=endpoint, + timeout=timeout, + content_type=content_type, + data=data, + files=files, + multipart_parser=multipart_parser, + max_retries=max_retries, + retry_delay=retry_delay, + retry_backoff=retry_backoff, + wait_label=wait_label, + monitor_progress=monitor_progress, + estimated_total=estimated_duration, + final_label_on_success=final_label_on_success, + progress_origin_ts=progress_origin_ts, + ) + return await _request_base(cfg, expect_binary=as_binary) + + +async def poll_op_raw( + cls: type[IO.ComfyNode], + poll_endpoint: ApiEndpoint, + *, + status_extractor: Callable[[dict[str, Any]], Optional[Union[str, int]]], + progress_extractor: Optional[Callable[[dict[str, Any]], Optional[int]]] = None, + price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None, + completed_statuses: Optional[list[Union[str, int]]] = None, + failed_statuses: Optional[list[Union[str, int]]] = None, + queued_statuses: Optional[list[Union[str, int]]] = None, + data: Optional[Union[dict[str, Any], BaseModel]] = None, + poll_interval: float = 5.0, + max_poll_attempts: int = 120, + timeout_per_poll: float = 120.0, + max_retries_per_poll: int = 3, + retry_delay_per_poll: float = 1.0, + retry_backoff_per_poll: float = 2.0, + estimated_duration: Optional[int] = None, + cancel_endpoint: Optional[ApiEndpoint] = None, + cancel_timeout: float = 10.0, +) -> dict[str, Any]: + """ + Polls an endpoint until the task reaches a terminal state. Displays time while queued/processing, + checks interruption every second, and calls Cancel endpoint (if provided) on interruption. + + Uses default complete, failed and queued states assumption. + + Returns the final JSON response from the poll endpoint. + """ + completed_states = _normalize_statuses(COMPLETED_STATUSES if completed_statuses is None else completed_statuses) + failed_states = _normalize_statuses(FAILED_STATUSES if failed_statuses is None else failed_statuses) + queued_states = _normalize_statuses(QUEUED_STATUSES if queued_statuses is None else queued_statuses) + started = time.monotonic() + consumed_attempts = 0 # counts only non-queued polls + + progress_bar = utils.ProgressBar(100) if progress_extractor else None + last_progress: Optional[int] = None + + state = _PollUIState(started=started, estimated_duration=estimated_duration) + stop_ticker = asyncio.Event() + + async def _ticker(): + """Emit a UI update every second while polling is in progress.""" + try: + while not stop_ticker.is_set(): + if is_processing_interrupted(): + break + now = time.monotonic() + proc_elapsed = state.base_processing_elapsed + ( + (now - state.active_since) if state.active_since is not None else 0.0 + ) + _display_time_progress( + cls, + status=state.status_label, + elapsed_seconds=int(now - state.started), + estimated_total=state.estimated_duration, + price=state.price, + is_queued=state.is_queued, + processing_elapsed_seconds=int(proc_elapsed), + ) + await asyncio.sleep(1.0) + except Exception as exc: + logging.debug("Polling ticker exited: %s", exc) + + ticker_task = asyncio.create_task(_ticker()) + try: + while consumed_attempts < max_poll_attempts: + try: + resp_json = await sync_op_raw( + cls, + poll_endpoint, + data=data, + timeout=timeout_per_poll, + max_retries=max_retries_per_poll, + retry_delay=retry_delay_per_poll, + retry_backoff=retry_backoff_per_poll, + wait_label="Checking", + estimated_duration=None, + as_binary=False, + final_label_on_success=None, + monitor_progress=False, + ) + if not isinstance(resp_json, dict): + raise Exception("Polling endpoint returned non-JSON response.") + except ProcessingInterrupted: + if cancel_endpoint: + with contextlib.suppress(Exception): + await sync_op_raw( + cls, + cancel_endpoint, + timeout=cancel_timeout, + max_retries=0, + wait_label="Cancelling task", + estimated_duration=None, + as_binary=False, + final_label_on_success=None, + monitor_progress=False, + ) + raise + + try: + status = _normalize_status_value(status_extractor(resp_json)) + except Exception as e: + logging.error("Status extraction failed: %s", e) + status = None + + if price_extractor: + new_price = price_extractor(resp_json) + if new_price is not None: + state.price = new_price + + if progress_extractor: + new_progress = progress_extractor(resp_json) + if new_progress is not None and last_progress != new_progress: + progress_bar.update_absolute(new_progress, total=100) + last_progress = new_progress + + now_ts = time.monotonic() + is_queued = status in queued_states + + if is_queued: + if state.active_since is not None: # If we just moved from active -> queued, close the active interval + state.base_processing_elapsed += now_ts - state.active_since + state.active_since = None + else: + if state.active_since is None: # If we just moved from queued -> active, open a new active interval + state.active_since = now_ts + + state.is_queued = is_queued + state.status_label = status or ("Queued" if is_queued else "Processing") + if status in completed_states: + if state.active_since is not None: + state.base_processing_elapsed += now_ts - state.active_since + state.active_since = None + stop_ticker.set() + with contextlib.suppress(Exception): + await ticker_task + + if progress_bar and last_progress != 100: + progress_bar.update_absolute(100, total=100) + + _display_time_progress( + cls, + status=status if status else "Completed", + elapsed_seconds=int(now_ts - started), + estimated_total=estimated_duration, + price=state.price, + is_queued=False, + processing_elapsed_seconds=int(state.base_processing_elapsed), + ) + return resp_json + + if status in failed_states: + msg = f"Task failed: {json.dumps(resp_json)}" + logging.error(msg) + raise Exception(msg) + + try: + await sleep_with_interrupt(poll_interval, cls, None, None, None) + except ProcessingInterrupted: + if cancel_endpoint: + with contextlib.suppress(Exception): + await sync_op_raw( + cls, + cancel_endpoint, + timeout=cancel_timeout, + max_retries=0, + wait_label="Cancelling task", + estimated_duration=None, + as_binary=False, + final_label_on_success=None, + monitor_progress=False, + ) + raise + if not is_queued: + consumed_attempts += 1 + + raise Exception( + f"Polling timed out after {max_poll_attempts} non-queued attempts " + f"(~{int(max_poll_attempts * poll_interval)}s of active polling)." + ) + except ProcessingInterrupted: + raise + except (LocalNetworkError, ApiServerError): + raise + except Exception as e: + raise Exception(f"Polling aborted due to error: {e}") from e + finally: + stop_ticker.set() + with contextlib.suppress(Exception): + await ticker_task + + +def _display_text( + node_cls: type[IO.ComfyNode], + text: Optional[str], + *, + status: Optional[Union[str, int]] = None, + price: Optional[float] = None, +) -> None: + display_lines: list[str] = [] + if status: + display_lines.append(f"Status: {status.capitalize() if isinstance(status, str) else status}") + if price is not None: + display_lines.append(f"Price: ${float(price):,.4f}") + if text is not None: + display_lines.append(text) + if display_lines: + PromptServer.instance.send_progress_text("\n".join(display_lines), get_node_id(node_cls)) + + +def _display_time_progress( + node_cls: type[IO.ComfyNode], + status: Optional[Union[str, int]], + elapsed_seconds: int, + estimated_total: Optional[int] = None, + *, + price: Optional[float] = None, + is_queued: Optional[bool] = None, + processing_elapsed_seconds: Optional[int] = None, +) -> None: + if estimated_total is not None and estimated_total > 0 and is_queued is False: + pe = processing_elapsed_seconds if processing_elapsed_seconds is not None else elapsed_seconds + remaining = max(0, int(estimated_total) - int(pe)) + time_line = f"Time elapsed: {int(elapsed_seconds)}s (~{remaining}s remaining)" + else: + time_line = f"Time elapsed: {int(elapsed_seconds)}s" + _display_text(node_cls, time_line, status=status, price=price) + + +async def _diagnose_connectivity() -> dict[str, bool]: + """Best-effort connectivity diagnostics to distinguish local vs. server issues.""" + results = { + "internet_accessible": False, + "api_accessible": False, + "is_local_issue": False, + "is_api_issue": False, + } + timeout = aiohttp.ClientTimeout(total=5.0) + async with aiohttp.ClientSession(timeout=timeout) as session: + try: + async with session.get("https://www.google.com") as resp: + results["internet_accessible"] = resp.status < 500 + except (ClientError, asyncio.TimeoutError, socket.gaierror): + results["is_local_issue"] = True + return results + + parsed = urlparse(default_base_url()) + health_url = f"{parsed.scheme}://{parsed.netloc}/health" + with contextlib.suppress(ClientError, asyncio.TimeoutError): + async with session.get(health_url) as resp: + results["api_accessible"] = resp.status < 500 + results["is_api_issue"] = results["internet_accessible"] and not results["api_accessible"] + return results + + +def _unpack_tuple(t: tuple) -> tuple[str, Any, str]: + """Normalize (filename, value, content_type).""" + if len(t) == 2: + return t[0], t[1], "application/octet-stream" + if len(t) == 3: + return t[0], t[1], t[2] + raise ValueError("files tuple must be (filename, file[, content_type])") + + +def _merge_params(endpoint_params: dict[str, Any], method: str, data: Optional[dict[str, Any]]) -> dict[str, Any]: + params = dict(endpoint_params or {}) + if method.upper() == "GET" and data: + for k, v in data.items(): + if v is not None: + params[k] = v + return params + + +def _friendly_http_message(status: int, body: Any) -> str: + if status == 401: + return "Unauthorized: Please login first to use this node." + if status == 402: + return "Payment Required: Please add credits to your account to use this node." + if status == 409: + return "There is a problem with your account. Please contact support@comfy.org." + if status == 429: + return "Rate Limit Exceeded: Please try again later." + try: + if isinstance(body, dict): + err = body.get("error") + if isinstance(err, dict): + msg = err.get("message") + typ = err.get("type") + if msg and typ: + return f"API Error: {msg} (Type: {typ})" + if msg: + return f"API Error: {msg}" + return f"API Error: {json.dumps(body)}" + else: + txt = str(body) + if len(txt) <= 200: + return f"API Error (raw): {txt}" + return f"API Error (status {status})" + except Exception: + return f"HTTP {status}: Unknown error" + + +def _generate_operation_id(method: str, path: str, attempt: int) -> str: + slug = path.strip("/").replace("/", "_") or "op" + return f"{method}_{slug}_try{attempt}_{uuid.uuid4().hex[:8]}" + + +def _snapshot_request_body_for_logging( + content_type: str, + method: str, + data: Optional[dict[str, Any]], + files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]], +) -> Optional[Union[dict[str, Any], str]]: + if method.upper() == "GET": + return None + if content_type == "multipart/form-data": + form_fields = sorted([k for k, v in (data or {}).items() if v is not None]) + file_fields: list[dict[str, str]] = [] + if files: + file_iter = files if isinstance(files, list) else list(files.items()) + for field_name, file_obj in file_iter: + if file_obj is None: + continue + if isinstance(file_obj, tuple): + filename = file_obj[0] + else: + filename = getattr(file_obj, "name", field_name) + file_fields.append({"field": field_name, "filename": str(filename or "")}) + return {"_multipart": True, "form_fields": form_fields, "file_fields": file_fields} + if content_type == "application/x-www-form-urlencoded": + return data or {} + return data or {} + + +async def _request_base(cfg: _RequestConfig, expect_binary: bool): + """Core request with retries, per-second interruption monitoring, true cancellation, and friendly errors.""" + url = cfg.endpoint.path + parsed_url = urlparse(url) + if not parsed_url.scheme and not parsed_url.netloc: # is URL relative? + url = urljoin(default_base_url().rstrip("/") + "/", url.lstrip("/")) + + method = cfg.endpoint.method + params = _merge_params(cfg.endpoint.query_params, method, cfg.data if method == "GET" else None) + + async def _monitor(stop_evt: asyncio.Event, start_ts: float): + """Every second: update elapsed time and signal interruption.""" + try: + while not stop_evt.is_set(): + if is_processing_interrupted(): + return + if cfg.monitor_progress: + _display_time_progress( + cfg.node_cls, cfg.wait_label, int(time.monotonic() - start_ts), cfg.estimated_total + ) + await asyncio.sleep(1.0) + except asyncio.CancelledError: + return # normal shutdown + + start_time = cfg.progress_origin_ts if cfg.progress_origin_ts is not None else time.monotonic() + attempt = 0 + delay = cfg.retry_delay + operation_succeeded: bool = False + final_elapsed_seconds: Optional[int] = None + while True: + attempt += 1 + stop_event = asyncio.Event() + monitor_task: Optional[asyncio.Task] = None + sess: Optional[aiohttp.ClientSession] = None + + operation_id = _generate_operation_id(method, cfg.endpoint.path, attempt) + logging.debug("[DEBUG] HTTP %s %s (attempt %d)", method, url, attempt) + + payload_headers = {"Accept": "*/*"} + if not parsed_url.scheme and not parsed_url.netloc: # is URL relative? + payload_headers.update(get_auth_header(cfg.node_cls)) + if cfg.endpoint.headers: + payload_headers.update(cfg.endpoint.headers) + + payload_kw: dict[str, Any] = {"headers": payload_headers} + if method == "GET": + payload_headers.pop("Content-Type", None) + request_body_log = _snapshot_request_body_for_logging(cfg.content_type, method, cfg.data, cfg.files) + try: + if cfg.monitor_progress: + monitor_task = asyncio.create_task(_monitor(stop_event, start_time)) + + timeout = aiohttp.ClientTimeout(total=cfg.timeout) + sess = aiohttp.ClientSession(timeout=timeout) + + if cfg.content_type == "multipart/form-data" and method != "GET": + # aiohttp will set Content-Type boundary; remove any fixed Content-Type + payload_headers.pop("Content-Type", None) + if cfg.multipart_parser and cfg.data: + form = cfg.multipart_parser(cfg.data) + if not isinstance(form, aiohttp.FormData): + raise ValueError("multipart_parser must return aiohttp.FormData") + else: + form = aiohttp.FormData(default_to_multipart=True) + if cfg.data: + for k, v in cfg.data.items(): + if v is None: + continue + form.add_field(k, str(v) if not isinstance(v, (bytes, bytearray)) else v) + if cfg.files: + file_iter = cfg.files if isinstance(cfg.files, list) else cfg.files.items() + for field_name, file_obj in file_iter: + if file_obj is None: + continue + if isinstance(file_obj, tuple): + filename, file_value, content_type = _unpack_tuple(file_obj) + else: + filename = getattr(file_obj, "name", field_name) + file_value = file_obj + content_type = "application/octet-stream" + # Attempt to rewind BytesIO for retries + if isinstance(file_value, BytesIO): + with contextlib.suppress(Exception): + file_value.seek(0) + form.add_field(field_name, file_value, filename=filename, content_type=content_type) + payload_kw["data"] = form + elif cfg.content_type == "application/x-www-form-urlencoded" and method != "GET": + payload_headers["Content-Type"] = "application/x-www-form-urlencoded" + payload_kw["data"] = cfg.data or {} + elif method != "GET": + payload_headers["Content-Type"] = "application/json" + payload_kw["json"] = cfg.data or {} + + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + request_headers=dict(payload_headers) if payload_headers else None, + request_params=dict(params) if params else None, + request_data=request_body_log, + ) + except Exception as _log_e: + logging.debug("[DEBUG] request logging failed: %s", _log_e) + + req_coro = sess.request(method, url, params=params, **payload_kw) + req_task = asyncio.create_task(req_coro) + + # Race: request vs. monitor (interruption) + tasks = {req_task} + if monitor_task: + tasks.add(monitor_task) + done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) + + if monitor_task and monitor_task in done: + # Interrupted – cancel the request and abort + if req_task in pending: + req_task.cancel() + raise ProcessingInterrupted("Task cancelled") + + # Otherwise, request finished + resp = await req_task + async with resp: + if resp.status >= 400: + try: + body = await resp.json() + except (ContentTypeError, json.JSONDecodeError): + body = await resp.text() + if resp.status in _RETRY_STATUS and attempt <= cfg.max_retries: + logging.warning( + "HTTP %s %s -> %s. Retrying in %.2fs (retry %d of %d).", + method, + url, + resp.status, + delay, + attempt, + cfg.max_retries, + ) + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=body, + error_message=_friendly_http_message(resp.status, body), + ) + except Exception as _log_e: + logging.debug("[DEBUG] response logging failed: %s", _log_e) + + await sleep_with_interrupt( + delay, + cfg.node_cls, + cfg.wait_label if cfg.monitor_progress else None, + start_time if cfg.monitor_progress else None, + cfg.estimated_total, + display_callback=_display_time_progress if cfg.monitor_progress else None, + ) + delay *= cfg.retry_backoff + continue + msg = _friendly_http_message(resp.status, body) + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=body, + error_message=msg, + ) + except Exception as _log_e: + logging.debug("[DEBUG] response logging failed: %s", _log_e) + raise Exception(msg) + + if expect_binary: + buff = bytearray() + last_tick = time.monotonic() + async for chunk in resp.content.iter_chunked(64 * 1024): + buff.extend(chunk) + now = time.monotonic() + if now - last_tick >= 1.0: + last_tick = now + if is_processing_interrupted(): + raise ProcessingInterrupted("Task cancelled") + if cfg.monitor_progress: + _display_time_progress( + cfg.node_cls, cfg.wait_label, int(now - start_time), cfg.estimated_total + ) + bytes_payload = bytes(buff) + operation_succeeded = True + final_elapsed_seconds = int(time.monotonic() - start_time) + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=bytes_payload, + ) + except Exception as _log_e: + logging.debug("[DEBUG] response logging failed: %s", _log_e) + return bytes_payload + else: + try: + payload = await resp.json() + response_content_to_log: Any = payload + except (ContentTypeError, json.JSONDecodeError): + text = await resp.text() + try: + payload = json.loads(text) if text else {} + except json.JSONDecodeError: + payload = {"_raw": text} + response_content_to_log = payload if isinstance(payload, dict) else text + operation_succeeded = True + final_elapsed_seconds = int(time.monotonic() - start_time) + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=response_content_to_log, + ) + except Exception as _log_e: + logging.debug("[DEBUG] response logging failed: %s", _log_e) + return payload + + except ProcessingInterrupted: + logging.debug("Polling was interrupted by user") + raise + except (ClientError, asyncio.TimeoutError, socket.gaierror) as e: + if attempt <= cfg.max_retries: + logging.warning( + "Connection error calling %s %s. Retrying in %.2fs (%d/%d): %s", + method, + url, + delay, + attempt, + cfg.max_retries, + str(e), + ) + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + request_headers=dict(payload_headers) if payload_headers else None, + request_params=dict(params) if params else None, + request_data=request_body_log, + error_message=f"{type(e).__name__}: {str(e)} (will retry)", + ) + except Exception as _log_e: + logging.debug("[DEBUG] request error logging failed: %s", _log_e) + await sleep_with_interrupt( + delay, + cfg.node_cls, + cfg.wait_label if cfg.monitor_progress else None, + start_time if cfg.monitor_progress else None, + cfg.estimated_total, + display_callback=_display_time_progress if cfg.monitor_progress else None, + ) + delay *= cfg.retry_backoff + continue + diag = await _diagnose_connectivity() + if diag.get("is_local_issue"): + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + request_headers=dict(payload_headers) if payload_headers else None, + request_params=dict(params) if params else None, + request_data=request_body_log, + error_message=f"LocalNetworkError: {str(e)}", + ) + except Exception as _log_e: + logging.debug("[DEBUG] final error logging failed: %s", _log_e) + raise LocalNetworkError( + "Unable to connect to the API server due to local network issues. " + "Please check your internet connection and try again." + ) from e + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + request_headers=dict(payload_headers) if payload_headers else None, + request_params=dict(params) if params else None, + request_data=request_body_log, + error_message=f"ApiServerError: {str(e)}", + ) + except Exception as _log_e: + logging.debug("[DEBUG] final error logging failed: %s", _log_e) + raise ApiServerError( + f"The API server at {default_base_url()} is currently unreachable. " + f"The service may be experiencing issues." + ) from e + finally: + stop_event.set() + if monitor_task: + monitor_task.cancel() + with contextlib.suppress(Exception): + await monitor_task + if sess: + with contextlib.suppress(Exception): + await sess.close() + if operation_succeeded and cfg.monitor_progress and cfg.final_label_on_success: + _display_time_progress( + cfg.node_cls, + status=cfg.final_label_on_success, + elapsed_seconds=( + final_elapsed_seconds + if final_elapsed_seconds is not None + else int(time.monotonic() - start_time) + ), + estimated_total=cfg.estimated_total, + price=None, + is_queued=False, + processing_elapsed_seconds=final_elapsed_seconds, + ) + + +def _validate_or_raise(response_model: Type[M], payload: Any) -> M: + try: + return response_model.model_validate(payload) + except Exception as e: + logging.error( + "Response validation failed for %s: %s", + getattr(response_model, "__name__", response_model), + e, + ) + raise Exception( + f"Response validation failed for {getattr(response_model, '__name__', response_model)}: {e}" + ) from e + + +def _wrap_model_extractor( + response_model: Type[M], + extractor: Optional[Callable[[M], Any]], +) -> Optional[Callable[[dict[str, Any]], Any]]: + """Wrap a typed extractor so it can be used by the dict-based poller. + Validates the dict into `response_model` before invoking `extractor`. + Uses a small per-wrapper cache keyed by `id(dict)` to avoid re-validating + the same response for multiple extractors in a single poll attempt. + """ + if extractor is None: + return None + _cache: dict[int, M] = {} + + def _wrapped(d: dict[str, Any]) -> Any: + try: + key = id(d) + model = _cache.get(key) + if model is None: + model = response_model.model_validate(d) + _cache[key] = model + return extractor(model) + except Exception as e: + logging.error("Extractor failed (typed -> dict wrapper): %s", e) + raise + + return _wrapped + + +def _normalize_statuses(values: Optional[Iterable[Union[str, int]]]) -> set[Union[str, int]]: + if not values: + return set() + out: set[Union[str, int]] = set() + for v in values: + nv = _normalize_status_value(v) + if nv is not None: + out.add(nv) + return out + + +def _normalize_status_value(val: Union[str, int, None]) -> Union[str, int, None]: + if isinstance(val, str): + return val.strip().lower() + return val diff --git a/comfy_api_nodes/util/common_exceptions.py b/comfy_api_nodes/util/common_exceptions.py new file mode 100644 index 000000000..0606a4407 --- /dev/null +++ b/comfy_api_nodes/util/common_exceptions.py @@ -0,0 +1,14 @@ +class NetworkError(Exception): + """Base exception for network-related errors with diagnostic information.""" + + +class LocalNetworkError(NetworkError): + """Exception raised when local network connectivity issues are detected.""" + + +class ApiServerError(NetworkError): + """Exception raised when the API server is unreachable but internet is working.""" + + +class ProcessingInterrupted(Exception): + """Operation was interrupted by user/runtime via processing_interrupted().""" diff --git a/comfy_api_nodes/util/conversions.py b/comfy_api_nodes/util/conversions.py new file mode 100644 index 000000000..10cd1051b --- /dev/null +++ b/comfy_api_nodes/util/conversions.py @@ -0,0 +1,407 @@ +import base64 +import logging +import math +import uuid +from io import BytesIO +from typing import Optional + +import av +import numpy as np +import torch +from PIL import Image + +from comfy.utils import common_upscale +from comfy_api.latest import Input, InputImpl + +from ._helpers import mimetype_to_extension + + +def bytesio_to_image_tensor(image_bytesio: BytesIO, mode: str = "RGBA") -> torch.Tensor: + """Converts image data from BytesIO to a torch.Tensor. + + Args: + image_bytesio: BytesIO object containing the image data. + mode: The PIL mode to convert the image to (e.g., "RGB", "RGBA"). + + Returns: + A torch.Tensor representing the image (1, H, W, C). + + Raises: + PIL.UnidentifiedImageError: If the image data cannot be identified. + ValueError: If the specified mode is invalid. + """ + image = Image.open(image_bytesio) + image = image.convert(mode) + image_array = np.array(image).astype(np.float32) / 255.0 + return torch.from_numpy(image_array).unsqueeze(0) + + +def image_tensor_pair_to_batch(image1: torch.Tensor, image2: torch.Tensor) -> torch.Tensor: + """ + Converts a pair of image tensors to a batch tensor. + If the images are not the same size, the smaller image is resized to + match the larger image. + """ + if image1.shape[1:] != image2.shape[1:]: + image2 = common_upscale( + image2.movedim(-1, 1), + image1.shape[2], + image1.shape[1], + "bilinear", + "center", + ).movedim(1, -1) + return torch.cat((image1, image2), dim=0) + + +def tensor_to_bytesio( + image: torch.Tensor, + name: Optional[str] = None, + total_pixels: int = 2048 * 2048, + mime_type: str = "image/png", +) -> BytesIO: + """Converts a torch.Tensor image to a named BytesIO object. + + Args: + image: Input torch.Tensor image. + name: Optional filename for the BytesIO object. + total_pixels: Maximum total pixels for potential downscaling. + mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4'). + + Returns: + Named BytesIO object containing the image data, with pointer set to the start of buffer. + """ + if not mime_type: + mime_type = "image/png" + + pil_image = tensor_to_pil(image, total_pixels=total_pixels) + img_binary = pil_to_bytesio(pil_image, mime_type=mime_type) + img_binary.name = f"{name if name else uuid.uuid4()}.{mimetype_to_extension(mime_type)}" + return img_binary + + +def tensor_to_pil(image: torch.Tensor, total_pixels: int = 2048 * 2048) -> Image.Image: + """Converts a single torch.Tensor image [H, W, C] to a PIL Image, optionally downscaling.""" + if len(image.shape) > 3: + image = image[0] + # TODO: remove alpha if not allowed and present + input_tensor = image.cpu() + input_tensor = downscale_image_tensor(input_tensor.unsqueeze(0), total_pixels=total_pixels).squeeze() + image_np = (input_tensor.numpy() * 255).astype(np.uint8) + img = Image.fromarray(image_np) + return img + + +def tensor_to_base64_string( + image_tensor: torch.Tensor, + total_pixels: int = 2048 * 2048, + mime_type: str = "image/png", +) -> str: + """Convert [B, H, W, C] or [H, W, C] tensor to a base64 string. + + Args: + image_tensor: Input torch.Tensor image. + total_pixels: Maximum total pixels for potential downscaling. + mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4'). + + Returns: + Base64 encoded string of the image. + """ + pil_image = tensor_to_pil(image_tensor, total_pixels=total_pixels) + img_byte_arr = pil_to_bytesio(pil_image, mime_type=mime_type) + img_bytes = img_byte_arr.getvalue() + # Encode bytes to base64 string + base64_encoded_string = base64.b64encode(img_bytes).decode("utf-8") + return base64_encoded_string + + +def pil_to_bytesio(img: Image.Image, mime_type: str = "image/png") -> BytesIO: + """Converts a PIL Image to a BytesIO object.""" + if not mime_type: + mime_type = "image/png" + + img_byte_arr = BytesIO() + # Derive PIL format from MIME type (e.g., 'image/png' -> 'PNG') + pil_format = mime_type.split("/")[-1].upper() + if pil_format == "JPG": + pil_format = "JPEG" + img.save(img_byte_arr, format=pil_format) + img_byte_arr.seek(0) + return img_byte_arr + + +def downscale_image_tensor(image, total_pixels=1536 * 1024) -> torch.Tensor: + """Downscale input image tensor to roughly the specified total pixels.""" + samples = image.movedim(-1, 1) + total = int(total_pixels) + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + if scale_by >= 1: + return image + width = round(samples.shape[3] * scale_by) + height = round(samples.shape[2] * scale_by) + + s = common_upscale(samples, width, height, "lanczos", "disabled") + s = s.movedim(1, -1) + return s + + +def tensor_to_data_uri( + image_tensor: torch.Tensor, + total_pixels: int = 2048 * 2048, + mime_type: str = "image/png", +) -> str: + """Converts a tensor image to a Data URI string. + + Args: + image_tensor: Input torch.Tensor image. + total_pixels: Maximum total pixels for potential downscaling. + mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp'). + + Returns: + Data URI string (e.g., 'data:image/png;base64,...'). + """ + base64_string = tensor_to_base64_string(image_tensor, total_pixels, mime_type) + return f"data:{mime_type};base64,{base64_string}" + + +def audio_to_base64_string(audio: Input.Audio, container_format: str = "mp4", codec_name: str = "aac") -> str: + """Converts an audio input to a base64 string.""" + sample_rate: int = audio["sample_rate"] + waveform: torch.Tensor = audio["waveform"] + audio_data_np = audio_tensor_to_contiguous_ndarray(waveform) + audio_bytes_io = audio_ndarray_to_bytesio(audio_data_np, sample_rate, container_format, codec_name) + audio_bytes = audio_bytes_io.getvalue() + return base64.b64encode(audio_bytes).decode("utf-8") + + +def audio_ndarray_to_bytesio( + audio_data_np: np.ndarray, + sample_rate: int, + container_format: str = "mp4", + codec_name: str = "aac", +) -> BytesIO: + """ + Encodes a numpy array of audio data into a BytesIO object. + """ + audio_bytes_io = BytesIO() + with av.open(audio_bytes_io, mode="w", format=container_format) as output_container: + audio_stream = output_container.add_stream(codec_name, rate=sample_rate) + frame = av.AudioFrame.from_ndarray( + audio_data_np, + format="fltp", + layout="stereo" if audio_data_np.shape[0] > 1 else "mono", + ) + frame.sample_rate = sample_rate + frame.pts = 0 + + for packet in audio_stream.encode(frame): + output_container.mux(packet) + + # Flush stream + for packet in audio_stream.encode(None): + output_container.mux(packet) + + audio_bytes_io.seek(0) + return audio_bytes_io + + +def audio_tensor_to_contiguous_ndarray(waveform: torch.Tensor) -> np.ndarray: + """ + Prepares audio waveform for av library by converting to a contiguous numpy array. + + Args: + waveform: a tensor of shape (1, channels, samples) derived from a Comfy `AUDIO` type. + + Returns: + Contiguous numpy array of the audio waveform. If the audio was batched, + the first item is taken. + """ + if waveform.ndim != 3 or waveform.shape[0] != 1: + raise ValueError("Expected waveform tensor shape (1, channels, samples)") + + # If batch is > 1, take first item + if waveform.shape[0] > 1: + waveform = waveform[0] + + # Prepare for av: remove batch dim, move to CPU, make contiguous, convert to numpy array + audio_data_np = waveform.squeeze(0).cpu().contiguous().numpy() + if audio_data_np.dtype != np.float32: + audio_data_np = audio_data_np.astype(np.float32) + + return audio_data_np + + +def audio_input_to_mp3(audio: Input.Audio) -> BytesIO: + waveform = audio["waveform"].cpu() + + output_buffer = BytesIO() + output_container = av.open(output_buffer, mode="w", format="mp3") + + out_stream = output_container.add_stream("libmp3lame", rate=audio["sample_rate"]) + out_stream.bit_rate = 320000 + + frame = av.AudioFrame.from_ndarray( + waveform.movedim(0, 1).reshape(1, -1).float().numpy(), + format="flt", + layout="mono" if waveform.shape[0] == 1 else "stereo", + ) + frame.sample_rate = audio["sample_rate"] + frame.pts = 0 + output_container.mux(out_stream.encode(frame)) + output_container.mux(out_stream.encode(None)) + output_container.close() + output_buffer.seek(0) + return output_buffer + + +def trim_video(video: Input.Video, duration_sec: float) -> Input.Video: + """ + Returns a new VideoInput object trimmed from the beginning to the specified duration, + using av to avoid loading entire video into memory. + + Args: + video: Input video to trim + duration_sec: Duration in seconds to keep from the beginning + + Returns: + VideoFromFile object that owns the output buffer + """ + output_buffer = BytesIO() + input_container = None + output_container = None + + try: + # Get the stream source - this avoids loading entire video into memory + # when the source is already a file path + input_source = video.get_stream_source() + + # Open containers + input_container = av.open(input_source, mode="r") + output_container = av.open(output_buffer, mode="w", format="mp4") + + # Set up output streams for re-encoding + video_stream = None + audio_stream = None + + for stream in input_container.streams: + logging.info("Found stream: type=%s, class=%s", stream.type, type(stream)) + if isinstance(stream, av.VideoStream): + # Create output video stream with same parameters + video_stream = output_container.add_stream("h264", rate=stream.average_rate) + video_stream.width = stream.width + video_stream.height = stream.height + video_stream.pix_fmt = "yuv420p" + logging.info("Added video stream: %sx%s @ %sfps", stream.width, stream.height, stream.average_rate) + elif isinstance(stream, av.AudioStream): + # Create output audio stream with same parameters + audio_stream = output_container.add_stream("aac", rate=stream.sample_rate) + audio_stream.sample_rate = stream.sample_rate + audio_stream.layout = stream.layout + logging.info("Added audio stream: %sHz, %s channels", stream.sample_rate, stream.channels) + + # Calculate target frame count that's divisible by 16 + fps = input_container.streams.video[0].average_rate + estimated_frames = int(duration_sec * fps) + target_frames = (estimated_frames // 16) * 16 # Round down to nearest multiple of 16 + + if target_frames == 0: + raise ValueError("Video too short: need at least 16 frames for Moonvalley") + + frame_count = 0 + audio_frame_count = 0 + + # Decode and re-encode video frames + if video_stream: + for frame in input_container.decode(video=0): + if frame_count >= target_frames: + break + + # Re-encode frame + for packet in video_stream.encode(frame): + output_container.mux(packet) + frame_count += 1 + + # Flush encoder + for packet in video_stream.encode(): + output_container.mux(packet) + + logging.info("Encoded %s video frames (target: %s)", frame_count, target_frames) + + # Decode and re-encode audio frames + if audio_stream: + input_container.seek(0) # Reset to beginning for audio + for frame in input_container.decode(audio=0): + if frame.time >= duration_sec: + break + + # Re-encode frame + for packet in audio_stream.encode(frame): + output_container.mux(packet) + audio_frame_count += 1 + + # Flush encoder + for packet in audio_stream.encode(): + output_container.mux(packet) + + logging.info("Encoded %s audio frames", audio_frame_count) + + # Close containers + output_container.close() + input_container.close() + + # Return as VideoFromFile using the buffer + output_buffer.seek(0) + return InputImpl.VideoFromFile(output_buffer) + + except Exception as e: + # Clean up on error + if input_container is not None: + input_container.close() + if output_container is not None: + output_container.close() + raise RuntimeError(f"Failed to trim video: {str(e)}") from e + + +def _f32_pcm(wav: torch.Tensor) -> torch.Tensor: + """Convert audio to float 32 bits PCM format. Copy-paste from nodes_audio.py file.""" + if wav.dtype.is_floating_point: + return wav + elif wav.dtype == torch.int16: + return wav.float() / (2**15) + elif wav.dtype == torch.int32: + return wav.float() / (2**31) + raise ValueError(f"Unsupported wav dtype: {wav.dtype}") + + +def audio_bytes_to_audio_input(audio_bytes: bytes) -> dict: + """ + Decode any common audio container from bytes using PyAV and return + a Comfy AUDIO dict: {"waveform": [1, C, T] float32, "sample_rate": int}. + """ + with av.open(BytesIO(audio_bytes)) as af: + if not af.streams.audio: + raise ValueError("No audio stream found in response.") + stream = af.streams.audio[0] + + in_sr = int(stream.codec_context.sample_rate) + out_sr = in_sr + + frames: list[torch.Tensor] = [] + n_channels = stream.channels or 1 + + for frame in af.decode(streams=stream.index): + arr = frame.to_ndarray() # shape can be [C, T] or [T, C] or [T] + buf = torch.from_numpy(arr) + if buf.ndim == 1: + buf = buf.unsqueeze(0) # [T] -> [1, T] + elif buf.shape[0] != n_channels and buf.shape[-1] == n_channels: + buf = buf.transpose(0, 1).contiguous() # [T, C] -> [C, T] + elif buf.shape[0] != n_channels: + buf = buf.reshape(-1, n_channels).t().contiguous() # fallback to [C, T] + frames.append(buf) + + if not frames: + raise ValueError("Decoded zero audio frames.") + + wav = torch.cat(frames, dim=1) # [C, T] + wav = _f32_pcm(wav) + return {"waveform": wav.unsqueeze(0).contiguous(), "sample_rate": out_sr} diff --git a/comfy_api_nodes/util/download_helpers.py b/comfy_api_nodes/util/download_helpers.py new file mode 100644 index 000000000..055e690de --- /dev/null +++ b/comfy_api_nodes/util/download_helpers.py @@ -0,0 +1,249 @@ +import asyncio +import contextlib +import uuid +from io import BytesIO +from pathlib import Path +from typing import IO, Optional, Union +from urllib.parse import urljoin, urlparse + +import aiohttp +import torch +from aiohttp.client_exceptions import ClientError, ContentTypeError + +from comfy_api.input_impl import VideoFromFile +from comfy_api.latest import IO as COMFY_IO +from comfy_api_nodes.apis import request_logger + +from ._helpers import ( + default_base_url, + get_auth_header, + is_processing_interrupted, + sleep_with_interrupt, +) +from .client import _diagnose_connectivity +from .common_exceptions import ApiServerError, LocalNetworkError, ProcessingInterrupted +from .conversions import bytesio_to_image_tensor + +_RETRY_STATUS = {408, 429, 500, 502, 503, 504} + + +async def download_url_to_bytesio( + url: str, + dest: Optional[Union[BytesIO, IO[bytes], str, Path]], + *, + timeout: Optional[float] = None, + max_retries: int = 3, + retry_delay: float = 1.0, + retry_backoff: float = 2.0, + cls: type[COMFY_IO.ComfyNode] = None, +) -> None: + """Stream-download a URL to `dest`. + + `dest` must be one of: + - a BytesIO (rewound to 0 after write), + - a file-like object opened in binary write mode (must implement .write()), + - a filesystem path (str | pathlib.Path), which will be opened with 'wb'. + + If `url` starts with `/proxy/`, `cls` must be provided so the URL can be expanded + to an absolute URL and authentication headers can be applied. + + Raises: + ProcessingInterrupted, LocalNetworkError, ApiServerError, Exception (HTTP and other errors) + """ + if not isinstance(dest, (str, Path)) and not hasattr(dest, "write"): + raise ValueError("dest must be a path (str|Path) or a binary-writable object providing .write().") + + attempt = 0 + delay = retry_delay + headers: dict[str, str] = {} + + parsed_url = urlparse(url) + if not parsed_url.scheme and not parsed_url.netloc: # is URL relative? + if cls is None: + raise ValueError("For relative 'cloud' paths, the `cls` parameter is required.") + url = urljoin(default_base_url().rstrip("/") + "/", url.lstrip("/")) + headers = get_auth_header(cls) + + while True: + attempt += 1 + op_id = _generate_operation_id("GET", url, attempt) + timeout_cfg = aiohttp.ClientTimeout(total=timeout) + + is_path_sink = isinstance(dest, (str, Path)) + fhandle = None + session: Optional[aiohttp.ClientSession] = None + stop_evt: Optional[asyncio.Event] = None + monitor_task: Optional[asyncio.Task] = None + req_task: Optional[asyncio.Task] = None + + try: + with contextlib.suppress(Exception): + request_logger.log_request_response(operation_id=op_id, request_method="GET", request_url=url) + + session = aiohttp.ClientSession(timeout=timeout_cfg) + stop_evt = asyncio.Event() + + async def _monitor(): + try: + while not stop_evt.is_set(): + if is_processing_interrupted(): + return + await asyncio.sleep(1.0) + except asyncio.CancelledError: + return + + monitor_task = asyncio.create_task(_monitor()) + + req_task = asyncio.create_task(session.get(url, headers=headers)) + done, pending = await asyncio.wait({req_task, monitor_task}, return_when=asyncio.FIRST_COMPLETED) + + if monitor_task in done and req_task in pending: + req_task.cancel() + with contextlib.suppress(Exception): + await req_task + raise ProcessingInterrupted("Task cancelled") + + try: + resp = await req_task + except asyncio.CancelledError: + raise ProcessingInterrupted("Task cancelled") from None + + async with resp: + if resp.status >= 400: + with contextlib.suppress(Exception): + try: + body = await resp.json() + except (ContentTypeError, ValueError): + text = await resp.text() + body = text if len(text) <= 4096 else f"[text {len(text)} bytes]" + request_logger.log_request_response( + operation_id=op_id, + request_method="GET", + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=body, + error_message=f"HTTP {resp.status}", + ) + + if resp.status in _RETRY_STATUS and attempt <= max_retries: + await sleep_with_interrupt(delay, cls, None, None, None) + delay *= retry_backoff + continue + raise Exception(f"Failed to download (HTTP {resp.status}).") + + if is_path_sink: + p = Path(str(dest)) + with contextlib.suppress(Exception): + p.parent.mkdir(parents=True, exist_ok=True) + fhandle = open(p, "wb") + sink = fhandle + else: + sink = dest # BytesIO or file-like + + written = 0 + while True: + try: + chunk = await asyncio.wait_for(resp.content.read(1024 * 1024), timeout=1.0) + except asyncio.TimeoutError: + chunk = b"" + except asyncio.CancelledError: + raise ProcessingInterrupted("Task cancelled") from None + + if is_processing_interrupted(): + raise ProcessingInterrupted("Task cancelled") + + if not chunk: + if resp.content.at_eof(): + break + continue + + sink.write(chunk) + written += len(chunk) + + if isinstance(dest, BytesIO): + with contextlib.suppress(Exception): + dest.seek(0) + + with contextlib.suppress(Exception): + request_logger.log_request_response( + operation_id=op_id, + request_method="GET", + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=f"[streamed {written} bytes to dest]", + ) + return + except asyncio.CancelledError: + raise ProcessingInterrupted("Task cancelled") from None + except (ClientError, asyncio.TimeoutError) as e: + if attempt <= max_retries: + with contextlib.suppress(Exception): + request_logger.log_request_response( + operation_id=op_id, + request_method="GET", + request_url=url, + error_message=f"{type(e).__name__}: {str(e)} (will retry)", + ) + await sleep_with_interrupt(delay, cls, None, None, None) + delay *= retry_backoff + continue + + diag = await _diagnose_connectivity() + if diag.get("is_local_issue"): + raise LocalNetworkError( + "Unable to connect to the network. Please check your internet connection and try again." + ) from e + raise ApiServerError("The remote service appears unreachable at this time.") from e + finally: + if stop_evt is not None: + stop_evt.set() + if monitor_task: + monitor_task.cancel() + with contextlib.suppress(Exception): + await monitor_task + if req_task and not req_task.done(): + req_task.cancel() + with contextlib.suppress(Exception): + await req_task + if session: + with contextlib.suppress(Exception): + await session.close() + if fhandle: + with contextlib.suppress(Exception): + fhandle.flush() + fhandle.close() + + +async def download_url_to_image_tensor( + url: str, + *, + timeout: float = None, + cls: type[COMFY_IO.ComfyNode] = None, +) -> torch.Tensor: + """Downloads an image from a URL and returns a [B, H, W, C] tensor.""" + result = BytesIO() + await download_url_to_bytesio(url, result, timeout=timeout, cls=cls) + return bytesio_to_image_tensor(result) + + +async def download_url_to_video_output( + video_url: str, + *, + timeout: float = None, + cls: type[COMFY_IO.ComfyNode] = None, +) -> VideoFromFile: + """Downloads a video from a URL and returns a `VIDEO` output.""" + result = BytesIO() + await download_url_to_bytesio(video_url, result, timeout=timeout, cls=cls) + return VideoFromFile(result) + + +def _generate_operation_id(method: str, url: str, attempt: int) -> str: + try: + parsed = urlparse(url) + slug = (parsed.path.rsplit("/", 1)[-1] or parsed.netloc or "download").strip("/").replace("/", "_") + except Exception: + slug = "download" + return f"{method}_{slug}_try{attempt}_{uuid.uuid4().hex[:8]}" diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py new file mode 100644 index 000000000..a345d451d --- /dev/null +++ b/comfy_api_nodes/util/upload_helpers.py @@ -0,0 +1,338 @@ +import asyncio +import contextlib +import logging +import time +import uuid +from io import BytesIO +from typing import Optional, Union +from urllib.parse import urlparse + +import aiohttp +import torch +from pydantic import BaseModel, Field + +from comfy_api.latest import IO, Input +from comfy_api.util import VideoCodec, VideoContainer +from comfy_api_nodes.apis import request_logger + +from ._helpers import is_processing_interrupted, sleep_with_interrupt +from .client import ( + ApiEndpoint, + _diagnose_connectivity, + _display_time_progress, + sync_op, +) +from .common_exceptions import ApiServerError, LocalNetworkError, ProcessingInterrupted +from .conversions import ( + audio_ndarray_to_bytesio, + audio_tensor_to_contiguous_ndarray, + tensor_to_bytesio, +) + + +class UploadRequest(BaseModel): + file_name: str = Field(..., description="Filename to upload") + content_type: Optional[str] = Field( + None, + description="Mime type of the file. For example: image/png, image/jpeg, video/mp4, etc.", + ) + + +class UploadResponse(BaseModel): + download_url: str = Field(..., description="URL to GET uploaded file") + upload_url: str = Field(..., description="URL to PUT file to upload") + + +async def upload_images_to_comfyapi( + cls: type[IO.ComfyNode], + image: torch.Tensor, + *, + max_images: int = 8, + mime_type: Optional[str] = None, + wait_label: Optional[str] = "Uploading", +) -> list[str]: + """ + Uploads images to ComfyUI API and returns download URLs. + To upload multiple images, stack them in the batch dimension first. + """ + # if batch, try to upload each file if max_images is greater than 0 + download_urls: list[str] = [] + is_batch = len(image.shape) > 3 + batch_len = image.shape[0] if is_batch else 1 + + for idx in range(min(batch_len, max_images)): + tensor = image[idx] if is_batch else image + img_io = tensor_to_bytesio(tensor, mime_type=mime_type) + url = await upload_file_to_comfyapi(cls, img_io, img_io.name, mime_type, wait_label) + download_urls.append(url) + return download_urls + + +async def upload_audio_to_comfyapi( + cls: type[IO.ComfyNode], + audio: Input.Audio, + *, + container_format: str = "mp4", + codec_name: str = "aac", + mime_type: str = "audio/mp4", + filename: str = "uploaded_audio.mp4", +) -> str: + """ + Uploads a single audio input to ComfyUI API and returns its download URL. + Encodes the raw waveform into the specified format before uploading. + """ + sample_rate: int = audio["sample_rate"] + waveform: torch.Tensor = audio["waveform"] + audio_data_np = audio_tensor_to_contiguous_ndarray(waveform) + audio_bytes_io = audio_ndarray_to_bytesio(audio_data_np, sample_rate, container_format, codec_name) + return await upload_file_to_comfyapi(cls, audio_bytes_io, filename, mime_type) + + +async def upload_video_to_comfyapi( + cls: type[IO.ComfyNode], + video: Input.Video, + *, + container: VideoContainer = VideoContainer.MP4, + codec: VideoCodec = VideoCodec.H264, + max_duration: Optional[int] = None, +) -> str: + """ + Uploads a single video to ComfyUI API and returns its download URL. + Uses the specified container and codec for saving the video before upload. + """ + if max_duration is not None: + try: + actual_duration = video.get_duration() + if actual_duration > max_duration: + raise ValueError( + f"Video duration ({actual_duration:.2f}s) exceeds the maximum allowed ({max_duration}s)." + ) + except Exception as e: + logging.error("Error getting video duration: %s", str(e)) + raise ValueError(f"Could not verify video duration from source: {e}") from e + + upload_mime_type = f"video/{container.value.lower()}" + filename = f"uploaded_video.{container.value.lower()}" + + # Convert VideoInput to BytesIO using specified container/codec + video_bytes_io = BytesIO() + video.save_to(video_bytes_io, format=container, codec=codec) + video_bytes_io.seek(0) + + return await upload_file_to_comfyapi(cls, video_bytes_io, filename, upload_mime_type) + + +async def upload_file_to_comfyapi( + cls: type[IO.ComfyNode], + file_bytes_io: BytesIO, + filename: str, + upload_mime_type: Optional[str], + wait_label: Optional[str] = "Uploading", +) -> str: + """Uploads a single file to ComfyUI API and returns its download URL.""" + if upload_mime_type is None: + request_object = UploadRequest(file_name=filename) + else: + request_object = UploadRequest(file_name=filename, content_type=upload_mime_type) + create_resp = await sync_op( + cls, + endpoint=ApiEndpoint(path="/customers/storage", method="POST"), + data=request_object, + response_model=UploadResponse, + final_label_on_success=None, + monitor_progress=False, + ) + await upload_file( + cls, + create_resp.upload_url, + file_bytes_io, + content_type=upload_mime_type, + wait_label=wait_label, + ) + return create_resp.download_url + + +async def upload_file( + cls: type[IO.ComfyNode], + upload_url: str, + file: Union[BytesIO, str], + *, + content_type: Optional[str] = None, + max_retries: int = 3, + retry_delay: float = 1.0, + retry_backoff: float = 2.0, + wait_label: Optional[str] = None, +) -> None: + """ + Upload a file to a signed URL (e.g., S3 pre-signed PUT) with retries, Comfy progress display, and interruption. + + Args: + cls: Node class (provides auth context + UI progress hooks). + upload_url: Pre-signed PUT URL. + file: BytesIO or path string. + content_type: Explicit MIME type. If None, we *suppress* Content-Type. + max_retries: Maximum retry attempts. + retry_delay: Initial delay in seconds. + retry_backoff: Exponential backoff factor. + wait_label: Progress label shown in Comfy UI. + + Raises: + ProcessingInterrupted, LocalNetworkError, ApiServerError, Exception + """ + if isinstance(file, BytesIO): + with contextlib.suppress(Exception): + file.seek(0) + data = file.read() + elif isinstance(file, str): + with open(file, "rb") as f: + data = f.read() + else: + raise ValueError("file must be a BytesIO or a filesystem path string") + + headers: dict[str, str] = {} + skip_auto_headers: set[str] = set() + if content_type: + headers["Content-Type"] = content_type + else: + skip_auto_headers.add("Content-Type") # Don't let aiohttp add Content-Type, it can break the signed request + + attempt = 0 + delay = retry_delay + start_ts = time.monotonic() + op_uuid = uuid.uuid4().hex[:8] + while True: + attempt += 1 + operation_id = _generate_operation_id("PUT", upload_url, attempt, op_uuid) + timeout = aiohttp.ClientTimeout(total=None) + stop_evt = asyncio.Event() + + async def _monitor(): + try: + while not stop_evt.is_set(): + if is_processing_interrupted(): + return + if wait_label: + _display_time_progress(cls, wait_label, int(time.monotonic() - start_ts), None) + await asyncio.sleep(1.0) + except asyncio.CancelledError: + return + + monitor_task = asyncio.create_task(_monitor()) + sess: Optional[aiohttp.ClientSession] = None + try: + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + request_headers=headers or None, + request_params=None, + request_data=f"[File data {len(data)} bytes]", + ) + except Exception as e: + logging.debug("[DEBUG] upload request logging failed: %s", e) + + sess = aiohttp.ClientSession(timeout=timeout) + req = sess.put(upload_url, data=data, headers=headers, skip_auto_headers=skip_auto_headers) + req_task = asyncio.create_task(req) + + done, pending = await asyncio.wait({req_task, monitor_task}, return_when=asyncio.FIRST_COMPLETED) + + if monitor_task in done and req_task in pending: + req_task.cancel() + raise ProcessingInterrupted("Upload cancelled") + + try: + resp = await req_task + except asyncio.CancelledError: + raise ProcessingInterrupted("Upload cancelled") from None + + async with resp: + if resp.status >= 400: + with contextlib.suppress(Exception): + try: + body = await resp.json() + except Exception: + body = await resp.text() + msg = f"Upload failed with status {resp.status}" + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=body, + error_message=msg, + ) + if resp.status in {408, 429, 500, 502, 503, 504} and attempt <= max_retries: + await sleep_with_interrupt( + delay, + cls, + wait_label, + start_ts, + None, + display_callback=_display_time_progress if wait_label else None, + ) + delay *= retry_backoff + continue + raise Exception(f"Failed to upload (HTTP {resp.status}).") + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content="File uploaded successfully.", + ) + except Exception as e: + logging.debug("[DEBUG] upload response logging failed: %s", e) + return + except asyncio.CancelledError: + raise ProcessingInterrupted("Task cancelled") from None + except (aiohttp.ClientError, asyncio.TimeoutError) as e: + if attempt <= max_retries: + with contextlib.suppress(Exception): + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + request_headers=headers or None, + request_data=f"[File data {len(data)} bytes]", + error_message=f"{type(e).__name__}: {str(e)} (will retry)", + ) + await sleep_with_interrupt( + delay, + cls, + wait_label, + start_ts, + None, + display_callback=_display_time_progress if wait_label else None, + ) + delay *= retry_backoff + continue + + diag = await _diagnose_connectivity() + if diag.get("is_local_issue"): + raise LocalNetworkError( + "Unable to connect to the network. Please check your internet connection and try again." + ) from e + raise ApiServerError("The API service appears unreachable at this time.") from e + finally: + stop_evt.set() + if monitor_task: + monitor_task.cancel() + with contextlib.suppress(Exception): + await monitor_task + if sess: + with contextlib.suppress(Exception): + await sess.close() + + +def _generate_operation_id(method: str, url: str, attempt: int, op_uuid: str) -> str: + try: + parsed = urlparse(url) + slug = (parsed.path.rsplit("/", 1)[-1] or parsed.netloc or "upload").strip("/").replace("/", "_") + except Exception: + slug = "upload" + return f"{method}_{slug}_{op_uuid}_try{attempt}" diff --git a/comfy_api_nodes/util/validation_utils.py b/comfy_api_nodes/util/validation_utils.py index ca913e9b3..22da05bc1 100644 --- a/comfy_api_nodes/util/validation_utils.py +++ b/comfy_api_nodes/util/validation_utils.py @@ -2,6 +2,8 @@ import logging from typing import Optional import torch + +from comfy_api.input.video_types import VideoInput from comfy_api.latest import Input @@ -28,9 +30,7 @@ def validate_image_dimensions( if max_width is not None and width > max_width: raise ValueError(f"Image width must be at most {max_width}px, got {width}px") if min_height is not None and height < min_height: - raise ValueError( - f"Image height must be at least {min_height}px, got {height}px" - ) + raise ValueError(f"Image height must be at least {min_height}px, got {height}px") if max_height is not None and height > max_height: raise ValueError(f"Image height must be at most {max_height}px, got {height}px") @@ -44,13 +44,9 @@ def validate_image_aspect_ratio( aspect_ratio = width / height if min_aspect_ratio is not None and aspect_ratio < min_aspect_ratio: - raise ValueError( - f"Image aspect ratio must be at least {min_aspect_ratio}, got {aspect_ratio}" - ) + raise ValueError(f"Image aspect ratio must be at least {min_aspect_ratio}, got {aspect_ratio}") if max_aspect_ratio is not None and aspect_ratio > max_aspect_ratio: - raise ValueError( - f"Image aspect ratio must be at most {max_aspect_ratio}, got {aspect_ratio}" - ) + raise ValueError(f"Image aspect ratio must be at most {max_aspect_ratio}, got {aspect_ratio}") def validate_image_aspect_ratio_range( @@ -58,7 +54,7 @@ def validate_image_aspect_ratio_range( min_ratio: tuple[float, float], # e.g. (1, 4) max_ratio: tuple[float, float], # e.g. (4, 1) *, - strict: bool = True, # True -> (min, max); False -> [min, max] + strict: bool = True, # True -> (min, max); False -> [min, max] ) -> float: a1, b1 = min_ratio a2, b2 = max_ratio @@ -85,7 +81,7 @@ def validate_aspect_ratio_closeness( min_rel: float, max_rel: float, *, - strict: bool = False, # True => exclusive, False => inclusive + strict: bool = False, # True => exclusive, False => inclusive ) -> None: w1, h1 = get_image_dimensions(start_img) w2, h2 = get_image_dimensions(end_img) @@ -118,9 +114,7 @@ def validate_video_dimensions( if max_width is not None and width > max_width: raise ValueError(f"Video width must be at most {max_width}px, got {width}px") if min_height is not None and height < min_height: - raise ValueError( - f"Video height must be at least {min_height}px, got {height}px" - ) + raise ValueError(f"Video height must be at least {min_height}px, got {height}px") if max_height is not None and height > max_height: raise ValueError(f"Video height must be at most {max_height}px, got {height}px") @@ -138,13 +132,9 @@ def validate_video_duration( epsilon = 0.0001 if min_duration is not None and min_duration - epsilon > duration: - raise ValueError( - f"Video duration must be at least {min_duration}s, got {duration}s" - ) + raise ValueError(f"Video duration must be at least {min_duration}s, got {duration}s") if max_duration is not None and duration > max_duration + epsilon: - raise ValueError( - f"Video duration must be at most {max_duration}s, got {duration}s" - ) + raise ValueError(f"Video duration must be at most {max_duration}s, got {duration}s") def get_number_of_images(images): @@ -165,3 +155,31 @@ def validate_audio_duration( raise ValueError(f"Audio duration must be at least {min_duration}s, got {dur + eps:.2f}s") if max_duration is not None and dur - eps > max_duration: raise ValueError(f"Audio duration must be at most {max_duration}s, got {dur - eps:.2f}s") + + +def validate_string( + string: str, + strip_whitespace=True, + field_name="prompt", + min_length=None, + max_length=None, +): + if string is None: + raise Exception(f"Field '{field_name}' cannot be empty.") + if strip_whitespace: + string = string.strip() + if min_length and len(string) < min_length: + raise Exception( + f"Field '{field_name}' cannot be shorter than {min_length} characters; was {len(string)} characters long." + ) + if max_length and len(string) > max_length: + raise Exception( + f" Field '{field_name} cannot be longer than {max_length} characters; was {len(string)} characters long." + ) + + +def validate_container_format_is_mp4(video: VideoInput) -> None: + """Validates video container format is MP4.""" + container_format = video.get_container_format() + if container_format not in ["mp4", "mov,mp4,m4a,3gp,3g2,mj2"]: + raise ValueError(f"Only MP4 container format supported. Got: {container_format}") diff --git a/pyproject.toml b/pyproject.toml index 0c6b23a25..fcc4854a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,8 @@ messages_control.disable = [ "too-many-branches", "too-many-locals", "too-many-arguments", + "too-many-return-statements", + "too-many-nested-blocks", "duplicate-code", "abstract-method", "superfluous-parens", From dd5af0c5871376c377b2e30f9725b67a768eea6f Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 25 Oct 2025 01:48:34 +0300 Subject: [PATCH 485/759] convert Tripo API nodes to V3 schema (#10469) --- comfy_api_nodes/apis/tripo_api.py | 15 +- comfy_api_nodes/nodes_tripo.py | 892 ++++++++++++----------- comfy_api_nodes/util/__init__.py | 2 + comfy_api_nodes/util/download_helpers.py | 12 + 4 files changed, 503 insertions(+), 418 deletions(-) diff --git a/comfy_api_nodes/apis/tripo_api.py b/comfy_api_nodes/apis/tripo_api.py index 9f43d4d09..713260e2a 100644 --- a/comfy_api_nodes/apis/tripo_api.py +++ b/comfy_api_nodes/apis/tripo_api.py @@ -1,13 +1,20 @@ from __future__ import annotations -from comfy_api_nodes.apis import ( - TripoModelVersion, - TripoTextureQuality, -) from enum import Enum from typing import Optional, List, Dict, Any, Union from pydantic import BaseModel, Field, RootModel +class TripoModelVersion(str, Enum): + v2_5_20250123 = 'v2.5-20250123' + v2_0_20240919 = 'v2.0-20240919' + v1_4_20240625 = 'v1.4-20240625' + + +class TripoTextureQuality(str, Enum): + standard = 'standard' + detailed = 'detailed' + + class TripoStyle(str, Enum): PERSON_TO_CARTOON = "person:person2cartoon" ANIMAL_VENOM = "animal:venom" diff --git a/comfy_api_nodes/nodes_tripo.py b/comfy_api_nodes/nodes_tripo.py index d08cf9007..697100ff2 100644 --- a/comfy_api_nodes/nodes_tripo.py +++ b/comfy_api_nodes/nodes_tripo.py @@ -1,46 +1,39 @@ import os -from folder_paths import get_output_directory -from comfy_api_nodes.mapper_utils import model_field_to_node_input -from comfy.comfy_types.node_typing import IO -from comfy_api_nodes.apis import ( - TripoOrientation, - TripoModelVersion, -) +from typing import Optional + +import torch +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension from comfy_api_nodes.apis.tripo_api import ( - TripoTaskType, - TripoStyle, - TripoFileReference, + TripoAnimateRetargetRequest, + TripoAnimateRigRequest, + TripoConvertModelRequest, TripoFileEmptyReference, - TripoUrlReference, + TripoFileReference, + TripoImageToModelRequest, + TripoModelVersion, + TripoMultiviewToModelRequest, + TripoOrientation, + TripoRefineModelRequest, + TripoStyle, TripoTaskResponse, TripoTaskStatus, + TripoTaskType, TripoTextToModelRequest, - TripoImageToModelRequest, - TripoMultiviewToModelRequest, TripoTextureModelRequest, - TripoRefineModelRequest, - TripoAnimateRigRequest, - TripoAnimateRetargetRequest, - TripoConvertModelRequest, + TripoUrlReference, ) - -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( + download_url_as_bytesio, + poll_op, + sync_op, upload_images_to_comfyapi, - download_url_to_bytesio, ) +from folder_paths import get_output_directory -async def upload_image_to_tripo(image, **kwargs): - urls = await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs) - return TripoFileReference(TripoUrlReference(url=urls[0], type="jpeg")) - def get_model_url_from_response(response: TripoTaskResponse) -> str: if response.data is not None: for key in ["pbr_model", "model", "base_model"]: @@ -50,20 +43,18 @@ def get_model_url_from_response(response: TripoTaskResponse) -> str: async def poll_until_finished( - kwargs: dict[str, str], + node_cls: type[IO.ComfyNode], response: TripoTaskResponse, -) -> tuple[str, str]: + average_duration: Optional[int] = None, +) -> IO.NodeOutput: """Polls the Tripo API endpoint until the task reaches a terminal state, then returns the response.""" if response.code != 0: raise RuntimeError(f"Failed to generate mesh: {response.error}") task_id = response.data.task_id - response_poll = await PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/tripo/v2/openapi/task/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TripoTaskResponse, - ), + response_poll = await poll_op( + node_cls, + poll_endpoint=ApiEndpoint(path=f"/proxy/tripo/v2/openapi/task/{task_id}"), + response_model=TripoTaskResponse, completed_statuses=[TripoTaskStatus.SUCCESS], failed_statuses=[ TripoTaskStatus.FAILED, @@ -73,72 +64,84 @@ async def poll_until_finished( TripoTaskStatus.EXPIRED, ], status_extractor=lambda x: x.data.status, - auth_kwargs=kwargs, - node_id=kwargs["unique_id"], - result_url_extractor=get_model_url_from_response, progress_extractor=lambda x: x.data.progress, - ).execute() + estimated_duration=average_duration, + ) if response_poll.data.status == TripoTaskStatus.SUCCESS: url = get_model_url_from_response(response_poll) - bytesio = await download_url_to_bytesio(url) + bytesio = await download_url_as_bytesio(url) # Save the downloaded model file model_file = f"tripo_model_{task_id}.glb" with open(os.path.join(get_output_directory(), model_file), "wb") as f: f.write(bytesio.getvalue()) - return model_file, task_id + return IO.NodeOutput(model_file, task_id) raise RuntimeError(f"Failed to generate mesh: {response_poll}") -class TripoTextToModelNode: +class TripoTextToModelNode(IO.ComfyNode): """ Generates 3D models synchronously based on a text prompt using Tripo's API. """ - AVERAGE_DURATION = 80 + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ("STRING", {"multiline": True}), - }, - "optional": { - "negative_prompt": ("STRING", {"multiline": True}), - "model_version": model_field_to_node_input(IO.COMBO, TripoTextToModelRequest, "model_version", enum_type=TripoModelVersion), - "style": model_field_to_node_input(IO.COMBO, TripoTextToModelRequest, "style", enum_type=TripoStyle, default="None"), - "texture": ("BOOLEAN", {"default": True}), - "pbr": ("BOOLEAN", {"default": True}), - "image_seed": ("INT", {"default": 42}), - "model_seed": ("INT", {"default": 42}), - "texture_seed": ("INT", {"default": 42}), - "texture_quality": (["standard", "detailed"], {"default": "standard"}), - "face_limit": ("INT", {"min": -1, "max": 500000, "default": -1}), - "quad": ("BOOLEAN", {"default": False}) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TripoTextToModelNode", + display_name="Tripo: Text to Model", + category="api node/3d/Tripo", + inputs=[ + IO.String.Input("prompt", multiline=True), + IO.String.Input("negative_prompt", multiline=True, optional=True), + IO.Combo.Input( + "model_version", options=TripoModelVersion, default=TripoModelVersion.v2_5_20250123, optional=True + ), + IO.Combo.Input("style", options=TripoStyle, default="None", optional=True), + IO.Boolean.Input("texture", default=True, optional=True), + IO.Boolean.Input("pbr", default=True, optional=True), + IO.Int.Input("image_seed", default=42, optional=True), + IO.Int.Input("model_seed", default=42, optional=True), + IO.Int.Input("texture_seed", default=42, optional=True), + IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True), + IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True), + IO.Boolean.Input("quad", default=False, optional=True), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) - RETURN_TYPES = ("STRING", "MODEL_TASK_ID",) - RETURN_NAMES = ("model_file", "model task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - - async def generate_mesh(self, prompt, negative_prompt=None, model_version=None, style=None, texture=None, pbr=None, image_seed=None, model_seed=None, texture_seed=None, texture_quality=None, face_limit=None, quad=None, **kwargs): + @classmethod + async def execute( + cls, + prompt: str, + negative_prompt: Optional[str] = None, + model_version=None, + style: Optional[str] = None, + texture: Optional[bool] = None, + pbr: Optional[bool] = None, + image_seed: Optional[int] = None, + model_seed: Optional[int] = None, + texture_seed: Optional[int] = None, + texture_quality: Optional[str] = None, + face_limit: Optional[int] = None, + quad: Optional[bool] = None, + ) -> IO.NodeOutput: style_enum = None if style == "None" else style if not prompt: raise RuntimeError("Prompt is required") - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoTextToModelRequest, - response_model=TripoTaskResponse, - ), - request=TripoTextToModelRequest( + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoTextToModelRequest( type=TripoTaskType.TEXT_TO_MODEL, prompt=prompt, negative_prompt=negative_prompt if negative_prompt else None, @@ -152,64 +155,89 @@ class TripoTextToModelNode: texture_quality=texture_quality, face_limit=face_limit, auto_size=True, - quad=quad + quad=quad, ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + ) + return await poll_until_finished(cls, response, average_duration=80) -class TripoImageToModelNode: +class TripoImageToModelNode(IO.ComfyNode): """ Generates 3D models synchronously based on a single image using Tripo's API. """ - AVERAGE_DURATION = 80 + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - }, - "optional": { - "model_version": model_field_to_node_input(IO.COMBO, TripoImageToModelRequest, "model_version", enum_type=TripoModelVersion), - "style": model_field_to_node_input(IO.COMBO, TripoTextToModelRequest, "style", enum_type=TripoStyle, default="None"), - "texture": ("BOOLEAN", {"default": True}), - "pbr": ("BOOLEAN", {"default": True}), - "model_seed": ("INT", {"default": 42}), - "orientation": model_field_to_node_input(IO.COMBO, TripoImageToModelRequest, "orientation", enum_type=TripoOrientation), - "texture_seed": ("INT", {"default": 42}), - "texture_quality": (["standard", "detailed"], {"default": "standard"}), - "texture_alignment": (["original_image", "geometry"], {"default": "original_image"}), - "face_limit": ("INT", {"min": -1, "max": 500000, "default": -1}), - "quad": ("BOOLEAN", {"default": False}) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TripoImageToModelNode", + display_name="Tripo: Image to Model", + category="api node/3d/Tripo", + inputs=[ + IO.Image.Input("image"), + IO.Combo.Input( + "model_version", + options=TripoModelVersion, + tooltip="The model version to use for generation", + optional=True, + ), + IO.Combo.Input("style", options=TripoStyle, default="None", optional=True), + IO.Boolean.Input("texture", default=True, optional=True), + IO.Boolean.Input("pbr", default=True, optional=True), + IO.Int.Input("model_seed", default=42, optional=True), + IO.Combo.Input( + "orientation", options=TripoOrientation, default=TripoOrientation.DEFAULT, optional=True + ), + IO.Int.Input("texture_seed", default=42, optional=True), + IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True), + IO.Combo.Input( + "texture_alignment", default="original_image", options=["original_image", "geometry"], optional=True + ), + IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True), + IO.Boolean.Input("quad", default=False, optional=True), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) - RETURN_TYPES = ("STRING", "MODEL_TASK_ID",) - RETURN_NAMES = ("model_file", "model task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - - async def generate_mesh(self, image, model_version=None, style=None, texture=None, pbr=None, model_seed=None, orientation=None, texture_alignment=None, texture_seed=None, texture_quality=None, face_limit=None, quad=None, **kwargs): + @classmethod + async def execute( + cls, + image: torch.Tensor, + model_version: Optional[str] = None, + style: Optional[str] = None, + texture: Optional[bool] = None, + pbr: Optional[bool] = None, + model_seed: Optional[int] = None, + orientation=None, + texture_seed: Optional[int] = None, + texture_quality: Optional[str] = None, + texture_alignment: Optional[str] = None, + face_limit: Optional[int] = None, + quad: Optional[bool] = None, + ) -> IO.NodeOutput: style_enum = None if style == "None" else style if image is None: raise RuntimeError("Image is required") - tripo_file = await upload_image_to_tripo(image, **kwargs) - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoImageToModelRequest, - response_model=TripoTaskResponse, - ), - request=TripoImageToModelRequest( + tripo_file = TripoFileReference( + root=TripoUrlReference( + url=(await upload_images_to_comfyapi(cls, image, max_images=1))[0], + type="jpeg", + ) + ) + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoImageToModelRequest( type=TripoTaskType.IMAGE_TO_MODEL, file=tripo_file, model_version=model_version, @@ -223,80 +251,105 @@ class TripoImageToModelNode: texture_quality=texture_quality, face_limit=face_limit, auto_size=True, - quad=quad + quad=quad, ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + ) + return await poll_until_finished(cls, response, average_duration=80) -class TripoMultiviewToModelNode: +class TripoMultiviewToModelNode(IO.ComfyNode): """ Generates 3D models synchronously based on up to four images (front, left, back, right) using Tripo's API. """ - AVERAGE_DURATION = 80 + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - }, - "optional": { - "image_left": ("IMAGE",), - "image_back": ("IMAGE",), - "image_right": ("IMAGE",), - "model_version": model_field_to_node_input(IO.COMBO, TripoMultiviewToModelRequest, "model_version", enum_type=TripoModelVersion), - "orientation": model_field_to_node_input(IO.COMBO, TripoImageToModelRequest, "orientation", enum_type=TripoOrientation), - "texture": ("BOOLEAN", {"default": True}), - "pbr": ("BOOLEAN", {"default": True}), - "model_seed": ("INT", {"default": 42}), - "texture_seed": ("INT", {"default": 42}), - "texture_quality": (["standard", "detailed"], {"default": "standard"}), - "texture_alignment": (["original_image", "geometry"], {"default": "original_image"}), - "face_limit": ("INT", {"min": -1, "max": 500000, "default": -1}), - "quad": ("BOOLEAN", {"default": False}) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TripoMultiviewToModelNode", + display_name="Tripo: Multiview to Model", + category="api node/3d/Tripo", + inputs=[ + IO.Image.Input("image"), + IO.Image.Input("image_left", optional=True), + IO.Image.Input("image_back", optional=True), + IO.Image.Input("image_right", optional=True), + IO.Combo.Input( + "model_version", + options=TripoModelVersion, + optional=True, + tooltip="The model version to use for generation", + ), + IO.Combo.Input( + "orientation", + options=TripoOrientation, + default=TripoOrientation.DEFAULT, + optional=True, + ), + IO.Boolean.Input("texture", default=True, optional=True), + IO.Boolean.Input("pbr", default=True, optional=True), + IO.Int.Input("model_seed", default=42, optional=True), + IO.Int.Input("texture_seed", default=42, optional=True), + IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True), + IO.Combo.Input( + "texture_alignment", default="original_image", options=["original_image", "geometry"], optional=True + ), + IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True), + IO.Boolean.Input("quad", default=False, optional=True), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) - RETURN_TYPES = ("STRING", "MODEL_TASK_ID",) - RETURN_NAMES = ("model_file", "model task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - - async def generate_mesh(self, image, image_left=None, image_back=None, image_right=None, model_version=None, orientation=None, texture=None, pbr=None, model_seed=None, texture_seed=None, texture_quality=None, texture_alignment=None, face_limit=None, quad=None, **kwargs): + @classmethod + async def execute( + cls, + image: torch.Tensor, + image_left: Optional[torch.Tensor] = None, + image_back: Optional[torch.Tensor] = None, + image_right: Optional[torch.Tensor] = None, + model_version: Optional[str] = None, + orientation: Optional[str] = None, + texture: Optional[bool] = None, + pbr: Optional[bool] = None, + model_seed: Optional[int] = None, + texture_seed: Optional[int] = None, + texture_quality: Optional[str] = None, + texture_alignment: Optional[str] = None, + face_limit: Optional[int] = None, + quad: Optional[bool] = None, + ) -> IO.NodeOutput: if image is None: raise RuntimeError("front image for multiview is required") images = [] - image_dict = { - "image": image, - "image_left": image_left, - "image_back": image_back, - "image_right": image_right - } + image_dict = {"image": image, "image_left": image_left, "image_back": image_back, "image_right": image_right} if image_left is None and image_back is None and image_right is None: raise RuntimeError("At least one of left, back, or right image must be provided for multiview") for image_name in ["image", "image_left", "image_back", "image_right"]: image_ = image_dict[image_name] if image_ is not None: - tripo_file = await upload_image_to_tripo(image_, **kwargs) - images.append(tripo_file) + images.append( + TripoFileReference( + root=TripoUrlReference( + url=(await upload_images_to_comfyapi(cls, image_, max_images=1))[0], type="jpeg" + ) + ) + ) else: images.append(TripoFileEmptyReference()) - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoMultiviewToModelRequest, - response_model=TripoTaskResponse, - ), - request=TripoMultiviewToModelRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoMultiviewToModelRequest( type=TripoTaskType.MULTIVIEW_TO_MODEL, files=images, model_version=model_version, @@ -310,272 +363,283 @@ class TripoMultiviewToModelNode: face_limit=face_limit, quad=quad, ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + ) + return await poll_until_finished(cls, response, average_duration=80) -class TripoTextureNode: +class TripoTextureNode(IO.ComfyNode): + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model_task_id": ("MODEL_TASK_ID",), - }, - "optional": { - "texture": ("BOOLEAN", {"default": True}), - "pbr": ("BOOLEAN", {"default": True}), - "texture_seed": ("INT", {"default": 42}), - "texture_quality": (["standard", "detailed"], {"default": "standard"}), - "texture_alignment": (["original_image", "geometry"], {"default": "original_image"}), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TripoTextureNode", + display_name="Tripo: Texture model", + category="api node/3d/Tripo", + inputs=[ + IO.Custom("MODEL_TASK_ID").Input("model_task_id"), + IO.Boolean.Input("texture", default=True, optional=True), + IO.Boolean.Input("pbr", default=True, optional=True), + IO.Int.Input("texture_seed", default=42, optional=True), + IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True), + IO.Combo.Input( + "texture_alignment", default="original_image", options=["original_image", "geometry"], optional=True + ), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) - RETURN_TYPES = ("STRING", "MODEL_TASK_ID",) - RETURN_NAMES = ("model_file", "model task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - AVERAGE_DURATION = 80 - - async def generate_mesh(self, model_task_id, texture=None, pbr=None, texture_seed=None, texture_quality=None, texture_alignment=None, **kwargs): - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoTextureModelRequest, - response_model=TripoTaskResponse, - ), - request=TripoTextureModelRequest( + @classmethod + async def execute( + cls, + model_task_id, + texture: Optional[bool] = None, + pbr: Optional[bool] = None, + texture_seed: Optional[int] = None, + texture_quality: Optional[str] = None, + texture_alignment: Optional[str] = None, + ) -> IO.NodeOutput: + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoTextureModelRequest( original_model_task_id=model_task_id, texture=texture, pbr=pbr, texture_seed=texture_seed, texture_quality=texture_quality, - texture_alignment=texture_alignment + texture_alignment=texture_alignment, ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + ) + return await poll_until_finished(cls, response, average_duration=80) -class TripoRefineNode: +class TripoRefineNode(IO.ComfyNode): + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model_task_id": ("MODEL_TASK_ID", { - "tooltip": "Must be a v1.4 Tripo model" - }), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TripoRefineNode", + display_name="Tripo: Refine Draft model", + category="api node/3d/Tripo", + description="Refine a draft model created by v1.4 Tripo models only.", + inputs=[ + IO.Custom("MODEL_TASK_ID").Input("model_task_id", tooltip="Must be a v1.4 Tripo model"), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) - DESCRIPTION = "Refine a draft model created by v1.4 Tripo models only." - - RETURN_TYPES = ("STRING", "MODEL_TASK_ID",) - RETURN_NAMES = ("model_file", "model task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - AVERAGE_DURATION = 240 - - async def generate_mesh(self, model_task_id, **kwargs): - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoRefineModelRequest, - response_model=TripoTaskResponse, - ), - request=TripoRefineModelRequest( - draft_model_task_id=model_task_id - ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) - - -class TripoRigNode: @classmethod - def INPUT_TYPES(s): - return { - "required": { - "original_model_task_id": ("MODEL_TASK_ID",), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = ("STRING", "RIG_TASK_ID") - RETURN_NAMES = ("model_file", "rig task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - AVERAGE_DURATION = 180 - - async def generate_mesh(self, original_model_task_id, **kwargs): - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoAnimateRigRequest, - response_model=TripoTaskResponse, - ), - request=TripoAnimateRigRequest( - original_model_task_id=original_model_task_id, - out_format="glb", - spec="tripo" - ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + async def execute(cls, model_task_id) -> IO.NodeOutput: + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoRefineModelRequest(draft_model_task_id=model_task_id), + ) + return await poll_until_finished(cls, response, average_duration=240) -class TripoRetargetNode: +class TripoRigNode(IO.ComfyNode): + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "original_model_task_id": ("RIG_TASK_ID",), - "animation": ([ - "preset:idle", - "preset:walk", - "preset:climb", - "preset:jump", - "preset:slash", - "preset:shoot", - "preset:hurt", - "preset:fall", - "preset:turn", - ],), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TripoRigNode", + display_name="Tripo: Rig model", + category="api node/3d/Tripo", + inputs=[IO.Custom("MODEL_TASK_ID").Input("original_model_task_id")], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("RIG_TASK_ID").Output(display_name="rig task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) - RETURN_TYPES = ("STRING", "RETARGET_TASK_ID") - RETURN_NAMES = ("model_file", "retarget task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - AVERAGE_DURATION = 30 + @classmethod + async def execute(cls, original_model_task_id) -> IO.NodeOutput: + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoAnimateRigRequest(original_model_task_id=original_model_task_id, out_format="glb", spec="tripo"), + ) + return await poll_until_finished(cls, response, average_duration=180) - async def generate_mesh(self, animation, original_model_task_id, **kwargs): - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoAnimateRetargetRequest, - response_model=TripoTaskResponse, - ), - request=TripoAnimateRetargetRequest( + +class TripoRetargetNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="TripoRetargetNode", + display_name="Tripo: Retarget rigged model", + category="api node/3d/Tripo", + inputs=[ + IO.Custom("RIG_TASK_ID").Input("original_model_task_id"), + IO.Combo.Input( + "animation", + options=[ + "preset:idle", + "preset:walk", + "preset:climb", + "preset:jump", + "preset:slash", + "preset:shoot", + "preset:hurt", + "preset:fall", + "preset:turn", + ], + ), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("RETARGET_TASK_ID").Output(display_name="retarget task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) + + @classmethod + async def execute(cls, original_model_task_id, animation: str) -> IO.NodeOutput: + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoAnimateRetargetRequest( original_model_task_id=original_model_task_id, animation=animation, out_format="glb", - bake_animation=True + bake_animation=True, ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + ) + return await poll_until_finished(cls, response, average_duration=30) -class TripoConversionNode: - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "original_model_task_id": ("MODEL_TASK_ID,RIG_TASK_ID,RETARGET_TASK_ID",), - "format": (["GLTF", "USDZ", "FBX", "OBJ", "STL", "3MF"],), - }, - "optional": { - "quad": ("BOOLEAN", {"default": False}), - "face_limit": ("INT", {"min": -1, "max": 500000, "default": -1}), - "texture_size": ("INT", {"min": 128, "max": 4096, "default": 4096}), - "texture_format": (["BMP", "DPX", "HDR", "JPEG", "OPEN_EXR", "PNG", "TARGA", "TIFF", "WEBP"], {"default": "JPEG"}) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } +class TripoConversionNode(IO.ComfyNode): @classmethod - def VALIDATE_INPUTS(cls, input_types): + def define_schema(cls): + return IO.Schema( + node_id="TripoConversionNode", + display_name="Tripo: Convert model", + category="api node/3d/Tripo", + inputs=[ + IO.Custom("MODEL_TASK_ID,RIG_TASK_ID,RETARGET_TASK_ID").Input("original_model_task_id"), + IO.Combo.Input("format", options=["GLTF", "USDZ", "FBX", "OBJ", "STL", "3MF"]), + IO.Boolean.Input("quad", default=False, optional=True), + IO.Int.Input( + "face_limit", + default=-1, + min=-1, + max=500000, + optional=True, + ), + IO.Int.Input( + "texture_size", + default=4096, + min=128, + max=4096, + optional=True, + ), + IO.Combo.Input( + "texture_format", + options=["BMP", "DPX", "HDR", "JPEG", "OPEN_EXR", "PNG", "TARGA", "TIFF", "WEBP"], + default="JPEG", + optional=True, + ), + ], + outputs=[], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) + + @classmethod + def validate_inputs(cls, input_types): # The min and max of input1 and input2 are still validated because # we didn't take `input1` or `input2` as arguments if input_types["original_model_task_id"] not in ("MODEL_TASK_ID", "RIG_TASK_ID", "RETARGET_TASK_ID"): return "original_model_task_id must be MODEL_TASK_ID, RIG_TASK_ID or RETARGET_TASK_ID type" return True - RETURN_TYPES = () - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - AVERAGE_DURATION = 30 - - async def generate_mesh(self, original_model_task_id, format, quad, face_limit, texture_size, texture_format, **kwargs): + @classmethod + async def execute( + cls, + original_model_task_id, + format: str, + quad: bool, + face_limit: int, + texture_size: int, + texture_format: str, + ) -> IO.NodeOutput: if not original_model_task_id: raise RuntimeError("original_model_task_id is required") - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoConvertModelRequest, - response_model=TripoTaskResponse, - ), - request=TripoConvertModelRequest( + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoConvertModelRequest( original_model_task_id=original_model_task_id, format=format, quad=quad if quad else None, face_limit=face_limit if face_limit != -1 else None, texture_size=texture_size if texture_size != 4096 else None, - texture_format=texture_format if texture_format != "JPEG" else None + texture_format=texture_format if texture_format != "JPEG" else None, ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + ) + return await poll_until_finished(cls, response, average_duration=30) -NODE_CLASS_MAPPINGS = { - "TripoTextToModelNode": TripoTextToModelNode, - "TripoImageToModelNode": TripoImageToModelNode, - "TripoMultiviewToModelNode": TripoMultiviewToModelNode, - "TripoTextureNode": TripoTextureNode, - "TripoRefineNode": TripoRefineNode, - "TripoRigNode": TripoRigNode, - "TripoRetargetNode": TripoRetargetNode, - "TripoConversionNode": TripoConversionNode, -} +class TripoExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + TripoTextToModelNode, + TripoImageToModelNode, + TripoMultiviewToModelNode, + TripoTextureNode, + TripoRefineNode, + TripoRigNode, + TripoRetargetNode, + TripoConversionNode, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "TripoTextToModelNode": "Tripo: Text to Model", - "TripoImageToModelNode": "Tripo: Image to Model", - "TripoMultiviewToModelNode": "Tripo: Multiview to Model", - "TripoTextureNode": "Tripo: Texture model", - "TripoRefineNode": "Tripo: Refine Draft model", - "TripoRigNode": "Tripo: Rig model", - "TripoRetargetNode": "Tripo: Retarget rigged model", - "TripoConversionNode": "Tripo: Convert model", -} + +async def comfy_entrypoint() -> TripoExtension: + return TripoExtension() diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py index c2ec391aa..ab96760cb 100644 --- a/comfy_api_nodes/util/__init__.py +++ b/comfy_api_nodes/util/__init__.py @@ -20,6 +20,7 @@ from .conversions import ( trim_video, ) from .download_helpers import ( + download_url_as_bytesio, download_url_to_bytesio, download_url_to_image_tensor, download_url_to_video_output, @@ -56,6 +57,7 @@ __all__ = [ "upload_images_to_comfyapi", "upload_video_to_comfyapi", # Download helpers + "download_url_as_bytesio", "download_url_to_bytesio", "download_url_to_image_tensor", "download_url_to_video_output", diff --git a/comfy_api_nodes/util/download_helpers.py b/comfy_api_nodes/util/download_helpers.py index 055e690de..791dd5a50 100644 --- a/comfy_api_nodes/util/download_helpers.py +++ b/comfy_api_nodes/util/download_helpers.py @@ -240,6 +240,18 @@ async def download_url_to_video_output( return VideoFromFile(result) +async def download_url_as_bytesio( + url: str, + *, + timeout: float = None, + cls: type[COMFY_IO.ComfyNode] = None, +) -> BytesIO: + """Downloads content from a URL and returns a new BytesIO (rewound to 0).""" + result = BytesIO() + await download_url_to_bytesio(url, result, timeout=timeout, cls=cls) + return result + + def _generate_operation_id(method: str, url: str, attempt: int) -> str: try: parsed = urlparse(url) From 426cde37f10dc391f9601ab938e02c0faa42db14 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 24 Oct 2025 16:56:51 -0700 Subject: [PATCH 486/759] Remove useless function (#10472) --- comfy/model_management.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 79d6ff9d4..cf015a29a 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -998,12 +998,6 @@ def device_supports_non_blocking(device): return False return True -def device_should_use_non_blocking(device): - if not device_supports_non_blocking(device): - return False - return False - # return True #TODO: figure out why this causes memory issues on Nvidia and possibly others - def force_channels_last(): if args.force_channels_last: return True From e86b79ab9ea7e740b80490353f3f5763840ede81 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 26 Oct 2025 00:35:30 +0300 Subject: [PATCH 487/759] convert Gemini API nodes to V3 schema (#10476) --- comfy_api_nodes/apinode_utils.py | 26 -- comfy_api_nodes/nodes_gemini.py | 629 +++++++++++----------------- comfy_api_nodes/util/__init__.py | 2 + comfy_api_nodes/util/conversions.py | 25 ++ 4 files changed, 282 insertions(+), 400 deletions(-) diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py index e3d282059..4182c8f80 100644 --- a/comfy_api_nodes/apinode_utils.py +++ b/comfy_api_nodes/apinode_utils.py @@ -3,8 +3,6 @@ import aiohttp import mimetypes from typing import Optional, Union from comfy.utils import common_upscale -from comfy_api.util import VideoContainer, VideoCodec -from comfy_api.input.video_types import VideoInput from comfy_api_nodes.apis.client import ( ApiClient, ApiEndpoint, @@ -209,30 +207,6 @@ async def upload_file_to_comfyapi( return response.download_url -def video_to_base64_string( - video: VideoInput, - container_format: VideoContainer = None, - codec: VideoCodec = None -) -> str: - """ - Converts a video input to a base64 string. - - Args: - video: The video input to convert - container_format: Optional container format to use (defaults to video.container if available) - codec: Optional codec to use (defaults to video.codec if available) - """ - video_bytes_io = BytesIO() - - # Use provided format/codec if specified, otherwise use video's own if available - format_to_use = container_format if container_format is not None else getattr(video, 'container', VideoContainer.MP4) - codec_to_use = codec if codec is not None else getattr(video, 'codec', VideoCodec.H264) - - video.save_to(video_bytes_io, format=format_to_use, codec=codec_to_use) - video_bytes_io.seek(0) - return base64.b64encode(video_bytes_io.getvalue()).decode("utf-8") - - async def upload_images_to_comfyapi( image: torch.Tensor, max_images=8, diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index ca11b67ed..67f2469ad 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -2,42 +2,47 @@ API Nodes for Gemini Multimodal LLM Usage via Remote API See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference """ + from __future__ import annotations -import json -import time -import os -import uuid import base64 -from io import BytesIO +import json +import os +import time +import uuid from enum import Enum -from typing import Optional, Literal +from io import BytesIO +from typing import Literal, Optional import torch +from typing_extensions import override import folder_paths -from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeDict -from server import PromptServer +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api.util import VideoCodec, VideoContainer from comfy_api_nodes.apis import ( GeminiContent, GeminiGenerateContentRequest, GeminiGenerateContentResponse, GeminiInlineData, - GeminiPart, GeminiMimeType, + GeminiPart, ) -from comfy_api_nodes.apis.gemini_api import GeminiImageGenerationConfig, GeminiImageGenerateContentRequest, GeminiImageConfig -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.apis.gemini_api import ( + GeminiImageConfig, + GeminiImageGenerateContentRequest, + GeminiImageGenerationConfig, +) +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, -) -from comfy_api_nodes.apinode_utils import ( + audio_to_base64_string, + bytesio_to_image_tensor, + sync_op, + tensor_to_base64_string, + validate_string, video_to_base64_string, ) -from comfy_api_nodes.util import validate_string, tensor_to_base64_string, bytesio_to_image_tensor, audio_to_base64_string -from comfy_api.util import VideoContainer, VideoCodec - +from server import PromptServer GEMINI_BASE_ENDPOINT = "/proxy/vertexai/gemini" GEMINI_MAX_INPUT_FILE_SIZE = 20 * 1024 * 1024 # 20 MB @@ -63,50 +68,6 @@ class GeminiImageModel(str, Enum): gemini_2_5_flash_image = "gemini-2.5-flash-image" -def get_gemini_endpoint( - model: GeminiModel, -) -> ApiEndpoint[GeminiGenerateContentRequest, GeminiGenerateContentResponse]: - """ - Get the API endpoint for a given Gemini model. - - Args: - model: The Gemini model to use, either as enum or string value. - - Returns: - ApiEndpoint configured for the specific Gemini model. - """ - if isinstance(model, str): - model = GeminiModel(model) - return ApiEndpoint( - path=f"{GEMINI_BASE_ENDPOINT}/{model.value}", - method=HttpMethod.POST, - request_model=GeminiGenerateContentRequest, - response_model=GeminiGenerateContentResponse, - ) - - -def get_gemini_image_endpoint( - model: GeminiImageModel, -) -> ApiEndpoint[GeminiGenerateContentRequest, GeminiGenerateContentResponse]: - """ - Get the API endpoint for a given Gemini model. - - Args: - model: The Gemini model to use, either as enum or string value. - - Returns: - ApiEndpoint configured for the specific Gemini model. - """ - if isinstance(model, str): - model = GeminiImageModel(model) - return ApiEndpoint( - path=f"{GEMINI_BASE_ENDPOINT}/{model.value}", - method=HttpMethod.POST, - request_model=GeminiImageGenerateContentRequest, - response_model=GeminiGenerateContentResponse, - ) - - def create_image_parts(image_input: torch.Tensor) -> list[GeminiPart]: """ Convert image tensor input to Gemini API compatible parts. @@ -119,9 +80,7 @@ def create_image_parts(image_input: torch.Tensor) -> list[GeminiPart]: """ image_parts: list[GeminiPart] = [] for image_index in range(image_input.shape[0]): - image_as_b64 = tensor_to_base64_string( - image_input[image_index].unsqueeze(0) - ) + image_as_b64 = tensor_to_base64_string(image_input[image_index].unsqueeze(0)) image_parts.append( GeminiPart( inlineData=GeminiInlineData( @@ -133,37 +92,7 @@ def create_image_parts(image_input: torch.Tensor) -> list[GeminiPart]: return image_parts -def create_text_part(text: str) -> GeminiPart: - """ - Create a text part for the Gemini API request. - - Args: - text: The text content to include in the request. - - Returns: - A GeminiPart object with the text content. - """ - return GeminiPart(text=text) - - -def get_parts_from_response( - response: GeminiGenerateContentResponse -) -> list[GeminiPart]: - """ - Extract all parts from the Gemini API response. - - Args: - response: The API response from Gemini. - - Returns: - List of response parts from the first candidate. - """ - return response.candidates[0].content.parts - - -def get_parts_by_type( - response: GeminiGenerateContentResponse, part_type: Literal["text"] | str -) -> list[GeminiPart]: +def get_parts_by_type(response: GeminiGenerateContentResponse, part_type: Literal["text"] | str) -> list[GeminiPart]: """ Filter response parts by their type. @@ -175,14 +104,10 @@ def get_parts_by_type( List of response parts matching the requested type. """ parts = [] - for part in get_parts_from_response(response): + for part in response.candidates[0].content.parts: if part_type == "text" and hasattr(part, "text") and part.text: parts.append(part) - elif ( - hasattr(part, "inlineData") - and part.inlineData - and part.inlineData.mimeType == part_type - ): + elif hasattr(part, "inlineData") and part.inlineData and part.inlineData.mimeType == part_type: parts.append(part) # Skip parts that don't match the requested type return parts @@ -210,11 +135,11 @@ def get_image_from_response(response: GeminiGenerateContentResponse) -> torch.Te returned_image = bytesio_to_image_tensor(BytesIO(image_data)) image_tensors.append(returned_image) if len(image_tensors) == 0: - return torch.zeros((1,1024,1024,4)) + return torch.zeros((1, 1024, 1024, 4)) return torch.cat(image_tensors, dim=0) -class GeminiNode(ComfyNodeABC): +class GeminiNode(IO.ComfyNode): """ Node to generate text responses from a Gemini model. @@ -225,96 +150,79 @@ class GeminiNode(ComfyNodeABC): """ @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text inputs to the model, used to generate a response. You can include detailed instructions, questions, or context for the model.", - }, + def define_schema(cls): + return IO.Schema( + node_id="GeminiNode", + display_name="Google Gemini", + category="api node/text/Gemini", + description="Generate text responses with Google's Gemini AI model. " + "You can provide multiple types of inputs (text, images, audio, video) " + "as context for generating more relevant and meaningful responses.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text inputs to the model, used to generate a response. " + "You can include detailed instructions, questions, or context for the model.", ), - "model": ( - IO.COMBO, - { - "tooltip": "The Gemini model to use for generating responses.", - "options": [model.value for model in GeminiModel], - "default": GeminiModel.gemini_2_5_pro.value, - }, + IO.Combo.Input( + "model", + options=GeminiModel, + default=GeminiModel.gemini_2_5_pro, + tooltip="The Gemini model to use for generating responses.", ), - "seed": ( - IO.INT, - { - "default": 42, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "When seed is fixed to a specific value, the model makes a best effort to provide the same response for repeated requests. Deterministic output isn't guaranteed. Also, changing the model or parameter settings, such as the temperature, can cause variations in the response even when you use the same seed value. By default, a random seed value is used.", - }, + IO.Int.Input( + "seed", + default=42, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="When seed is fixed to a specific value, the model makes a best effort to provide " + "the same response for repeated requests. Deterministic output isn't guaranteed. " + "Also, changing the model or parameter settings, such as the temperature, " + "can cause variations in the response even when you use the same seed value. " + "By default, a random seed value is used.", ), - }, - "optional": { - "images": ( - IO.IMAGE, - { - "default": None, - "tooltip": "Optional image(s) to use as context for the model. To include multiple images, you can use the Batch Images node.", - }, + IO.Image.Input( + "images", + optional=True, + tooltip="Optional image(s) to use as context for the model. " + "To include multiple images, you can use the Batch Images node.", ), - "audio": ( - IO.AUDIO, - { - "tooltip": "Optional audio to use as context for the model.", - "default": None, - }, + IO.Audio.Input( + "audio", + optional=True, + tooltip="Optional audio to use as context for the model.", ), - "video": ( - IO.VIDEO, - { - "tooltip": "Optional video to use as context for the model.", - "default": None, - }, + IO.Video.Input( + "video", + optional=True, + tooltip="Optional video to use as context for the model.", ), - "files": ( - "GEMINI_INPUT_FILES", - { - "default": None, - "tooltip": "Optional file(s) to use as context for the model. Accepts inputs from the Gemini Generate Content Input Files node.", - }, + IO.Custom("GEMINI_INPUT_FILES").Input( + "files", + optional=True, + tooltip="Optional file(s) to use as context for the model. " + "Accepts inputs from the Gemini Generate Content Input Files node.", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - DESCRIPTION = "Generate text responses with Google's Gemini AI model. You can provide multiple types of inputs (text, images, audio, video) as context for generating more relevant and meaningful responses." - RETURN_TYPES = ("STRING",) - FUNCTION = "api_call" - CATEGORY = "api node/text/Gemini" - API_NODE = True - - def create_video_parts(self, video_input: IO.VIDEO, **kwargs) -> list[GeminiPart]: - """ - Convert video input to Gemini API compatible parts. - - Args: - video_input: Video tensor from ComfyUI. - **kwargs: Additional arguments to pass to the conversion function. - - Returns: - List of GeminiPart objects containing the encoded video. - """ - - base_64_string = video_to_base64_string( - video_input, - container_format=VideoContainer.MP4, - codec=VideoCodec.H264 + ], + outputs=[ + IO.String.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, ) + + @classmethod + def create_video_parts(cls, video_input: Input.Video) -> list[GeminiPart]: + """Convert video input to Gemini API compatible parts.""" + + base_64_string = video_to_base64_string(video_input, container_format=VideoContainer.MP4, codec=VideoCodec.H264) return [ GeminiPart( inlineData=GeminiInlineData( @@ -324,7 +232,8 @@ class GeminiNode(ComfyNodeABC): ) ] - def create_audio_parts(self, audio_input: IO.AUDIO) -> list[GeminiPart]: + @classmethod + def create_audio_parts(cls, audio_input: Input.Audio) -> list[GeminiPart]: """ Convert audio input to Gemini API compatible parts. @@ -337,10 +246,10 @@ class GeminiNode(ComfyNodeABC): audio_parts: list[GeminiPart] = [] for batch_index in range(audio_input["waveform"].shape[0]): # Recreate an IO.AUDIO object for the given batch dimension index - audio_at_index = { - "waveform": audio_input["waveform"][batch_index].unsqueeze(0), - "sample_rate": audio_input["sample_rate"], - } + audio_at_index = Input.Audio( + waveform=audio_input["waveform"][batch_index].unsqueeze(0), + sample_rate=audio_input["sample_rate"], + ) # Convert to MP3 format for compatibility with Gemini API audio_bytes = audio_to_base64_string( audio_at_index, @@ -357,38 +266,38 @@ class GeminiNode(ComfyNodeABC): ) return audio_parts - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, - model: GeminiModel, - images: Optional[IO.IMAGE] = None, - audio: Optional[IO.AUDIO] = None, - video: Optional[IO.VIDEO] = None, + model: str, + seed: int, + images: Optional[torch.Tensor] = None, + audio: Optional[Input.Audio] = None, + video: Optional[Input.Video] = None, files: Optional[list[GeminiPart]] = None, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[str]: - # Validate inputs + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) # Create parts list with text prompt as the first part - parts: list[GeminiPart] = [create_text_part(prompt)] + parts: list[GeminiPart] = [GeminiPart(text=prompt)] # Add other modal parts if images is not None: image_parts = create_image_parts(images) parts.extend(image_parts) if audio is not None: - parts.extend(self.create_audio_parts(audio)) + parts.extend(cls.create_audio_parts(audio)) if video is not None: - parts.extend(self.create_video_parts(video)) + parts.extend(cls.create_video_parts(video)) if files is not None: parts.extend(files) # Create response - response = await SynchronousOperation( - endpoint=get_gemini_endpoint(model), - request=GeminiGenerateContentRequest( + response = await sync_op( + cls, + endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"), + data=GeminiGenerateContentRequest( contents=[ GeminiContent( role="user", @@ -396,15 +305,15 @@ class GeminiNode(ComfyNodeABC): ) ] ), - auth_kwargs=kwargs, - ).execute() + response_model=GeminiGenerateContentResponse, + ) # Get result output output_text = get_text_from_response(response) - if unique_id and output_text: + if output_text: # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. render_spec = { - "node_id": unique_id, + "node_id": cls.hidden.unique_id, "component": "ChatHistoryWidget", "props": { "history": json.dumps( @@ -424,10 +333,10 @@ class GeminiNode(ComfyNodeABC): render_spec, ) - return (output_text or "Empty response from Gemini model...",) + return IO.NodeOutput(output_text or "Empty response from Gemini model...") -class GeminiInputFiles(ComfyNodeABC): +class GeminiInputFiles(IO.ComfyNode): """ Loads and formats input files for use with the Gemini API. @@ -438,7 +347,7 @@ class GeminiInputFiles(ComfyNodeABC): """ @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: + def define_schema(cls): """ For details about the supported file input types, see: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference @@ -453,39 +362,37 @@ class GeminiInputFiles(ComfyNodeABC): ] input_files = sorted(input_files, key=lambda x: x.name) input_files = [f.name for f in input_files] - return { - "required": { - "file": ( - IO.COMBO, - { - "tooltip": "Input files to include as context for the model. Only accepts text (.txt) and PDF (.pdf) files for now.", - "options": input_files, - "default": input_files[0] if input_files else None, - }, + return IO.Schema( + node_id="GeminiInputFiles", + display_name="Gemini Input Files", + category="api node/text/Gemini", + description="Loads and prepares input files to include as inputs for Gemini LLM nodes. " + "The files will be read by the Gemini model when generating a response. " + "The contents of the text file count toward the token limit. " + "🛈 TIP: Can be chained together with other Gemini Input File nodes.", + inputs=[ + IO.Combo.Input( + "file", + options=input_files, + default=input_files[0] if input_files else None, + tooltip="Input files to include as context for the model. " + "Only accepts text (.txt) and PDF (.pdf) files for now.", ), - }, - "optional": { - "GEMINI_INPUT_FILES": ( + IO.Custom("GEMINI_INPUT_FILES").Input( "GEMINI_INPUT_FILES", - { - "tooltip": "An optional additional file(s) to batch together with the file loaded from this node. Allows chaining of input files so that a single message can include multiple input files.", - "default": None, - }, + optional=True, + tooltip="An optional additional file(s) to batch together with the file loaded from this node. " + "Allows chaining of input files so that a single message can include multiple input files.", ), - }, - } - - DESCRIPTION = "Loads and prepares input files to include as inputs for Gemini LLM nodes. The files will be read by the Gemini model when generating a response. The contents of the text file count toward the token limit. 🛈 TIP: Can be chained together with other Gemini Input File nodes." - RETURN_TYPES = ("GEMINI_INPUT_FILES",) - FUNCTION = "prepare_files" - CATEGORY = "api node/text/Gemini" - - def create_file_part(self, file_path: str) -> GeminiPart: - mime_type = ( - GeminiMimeType.application_pdf - if file_path.endswith(".pdf") - else GeminiMimeType.text_plain + ], + outputs=[ + IO.Custom("GEMINI_INPUT_FILES").Output(), + ], ) + + @classmethod + def create_file_part(cls, file_path: str) -> GeminiPart: + mime_type = GeminiMimeType.application_pdf if file_path.endswith(".pdf") else GeminiMimeType.text_plain # Use base64 string directly, not the data URI with open(file_path, "rb") as f: file_content = f.read() @@ -498,120 +405,95 @@ class GeminiInputFiles(ComfyNodeABC): ) ) - def prepare_files( - self, file: str, GEMINI_INPUT_FILES: list[GeminiPart] = [] - ) -> tuple[list[GeminiPart]]: - """ - Loads and formats input files for Gemini API. - """ - file_path = folder_paths.get_annotated_filepath(file) - input_file_content = self.create_file_part(file_path) - files = [input_file_content] + GEMINI_INPUT_FILES - return (files,) - - -class GeminiImage(ComfyNodeABC): - """ - Node to generate text and image responses from a Gemini model. - - This node allows users to interact with Google's Gemini AI models, providing - multimodal inputs (text, images, files) to generate coherent - text and image responses. The node works with the latest Gemini models, handling the - API communication and response parsing. - """ @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text prompt for generation", - }, - ), - "model": ( - IO.COMBO, - { - "tooltip": "The Gemini model to use for generating responses.", - "options": [model.value for model in GeminiImageModel], - "default": GeminiImageModel.gemini_2_5_flash_image.value, - }, - ), - "seed": ( - IO.INT, - { - "default": 42, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "When seed is fixed to a specific value, the model makes a best effort to provide the same response for repeated requests. Deterministic output isn't guaranteed. Also, changing the model or parameter settings, such as the temperature, can cause variations in the response even when you use the same seed value. By default, a random seed value is used.", - }, - ), - }, - "optional": { - "images": ( - IO.IMAGE, - { - "default": None, - "tooltip": "Optional image(s) to use as context for the model. To include multiple images, you can use the Batch Images node.", - }, - ), - "files": ( - "GEMINI_INPUT_FILES", - { - "default": None, - "tooltip": "Optional file(s) to use as context for the model. Accepts inputs from the Gemini Generate Content Input Files node.", - }, - ), - # TODO: later we can add this parameter later - # "n": ( - # IO.INT, - # { - # "default": 1, - # "min": 1, - # "max": 8, - # "step": 1, - # "display": "number", - # "tooltip": "How many images to generate", - # }, - # ), - "aspect_ratio": ( - IO.COMBO, - { - "tooltip": "Defaults to matching the output image size to that of your input image, or otherwise generates 1:1 squares.", - "options": ["auto", "1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"], - "default": "auto", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def execute(cls, file: str, GEMINI_INPUT_FILES: Optional[list[GeminiPart]] = None) -> IO.NodeOutput: + """Loads and formats input files for Gemini API.""" + if GEMINI_INPUT_FILES is None: + GEMINI_INPUT_FILES = [] + file_path = folder_paths.get_annotated_filepath(file) + input_file_content = cls.create_file_part(file_path) + return IO.NodeOutput([input_file_content] + GEMINI_INPUT_FILES) - RETURN_TYPES = (IO.IMAGE, IO.STRING) - FUNCTION = "api_call" - CATEGORY = "api node/image/Gemini" - DESCRIPTION = "Edit images synchronously via Google API." - API_NODE = True - async def api_call( - self, +class GeminiImage(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="GeminiImageNode", + display_name="Google Gemini Image", + category="api node/image/Gemini", + description="Edit images synchronously via Google API.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + tooltip="Text prompt for generation", + default="", + ), + IO.Combo.Input( + "model", + options=GeminiImageModel, + default=GeminiImageModel.gemini_2_5_flash_image, + tooltip="The Gemini model to use for generating responses.", + ), + IO.Int.Input( + "seed", + default=42, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="When seed is fixed to a specific value, the model makes a best effort to provide " + "the same response for repeated requests. Deterministic output isn't guaranteed. " + "Also, changing the model or parameter settings, such as the temperature, " + "can cause variations in the response even when you use the same seed value. " + "By default, a random seed value is used.", + ), + IO.Image.Input( + "images", + optional=True, + tooltip="Optional image(s) to use as context for the model. " + "To include multiple images, you can use the Batch Images node.", + ), + IO.Custom("GEMINI_INPUT_FILES").Input( + "files", + optional=True, + tooltip="Optional file(s) to use as context for the model. " + "Accepts inputs from the Gemini Generate Content Input Files node.", + ), + IO.Combo.Input( + "aspect_ratio", + options=["auto", "1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"], + default="auto", + tooltip="Defaults to matching the output image size to that of your input image, " + "or otherwise generates 1:1 squares.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + IO.String.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, prompt: str, - model: GeminiImageModel, - images: Optional[IO.IMAGE] = None, + model: str, + seed: int, + images: Optional[torch.Tensor] = None, files: Optional[list[GeminiPart]] = None, - n=1, aspect_ratio: str = "auto", - unique_id: Optional[str] = None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) - parts: list[GeminiPart] = [create_text_part(prompt)] + parts: list[GeminiPart] = [GeminiPart(text=prompt)] if not aspect_ratio: aspect_ratio = "auto" # for backward compatability with old workflows; to-do remove this in December @@ -623,29 +505,27 @@ class GeminiImage(ComfyNodeABC): if files is not None: parts.extend(files) - response = await SynchronousOperation( - endpoint=get_gemini_image_endpoint(model), - request=GeminiImageGenerateContentRequest( + response = await sync_op( + cls, + endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"), + data=GeminiImageGenerateContentRequest( contents=[ - GeminiContent( - role="user", - parts=parts, - ), + GeminiContent(role="user", parts=parts), ], generationConfig=GeminiImageGenerationConfig( - responseModalities=["TEXT","IMAGE"], + responseModalities=["TEXT", "IMAGE"], imageConfig=None if aspect_ratio == "auto" else image_config, - ) + ), ), - auth_kwargs=kwargs, - ).execute() + response_model=GeminiGenerateContentResponse, + ) output_image = get_image_from_response(response) output_text = get_text_from_response(response) - if unique_id and output_text: + if output_text: # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. render_spec = { - "node_id": unique_id, + "node_id": cls.hidden.unique_id, "component": "ChatHistoryWidget", "props": { "history": json.dumps( @@ -666,17 +546,18 @@ class GeminiImage(ComfyNodeABC): ) output_text = output_text or "Empty response from Gemini model..." - return (output_image, output_text,) + return IO.NodeOutput(output_image, output_text) -NODE_CLASS_MAPPINGS = { - "GeminiNode": GeminiNode, - "GeminiImageNode": GeminiImage, - "GeminiInputFiles": GeminiInputFiles, -} +class GeminiExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + GeminiNode, + GeminiImage, + GeminiInputFiles, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "GeminiNode": "Google Gemini", - "GeminiImageNode": "Google Gemini Image", - "GeminiInputFiles": "Gemini Input Files", -} + +async def comfy_entrypoint() -> GeminiExtension: + return GeminiExtension() diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py index ab96760cb..0cca2b59b 100644 --- a/comfy_api_nodes/util/__init__.py +++ b/comfy_api_nodes/util/__init__.py @@ -18,6 +18,7 @@ from .conversions import ( tensor_to_bytesio, tensor_to_pil, trim_video, + video_to_base64_string, ) from .download_helpers import ( download_url_as_bytesio, @@ -73,6 +74,7 @@ __all__ = [ "tensor_to_bytesio", "tensor_to_pil", "trim_video", + "video_to_base64_string", # Validation utilities "get_number_of_images", "validate_aspect_ratio_closeness", diff --git a/comfy_api_nodes/util/conversions.py b/comfy_api_nodes/util/conversions.py index 10cd1051b..9f4c90c5c 100644 --- a/comfy_api_nodes/util/conversions.py +++ b/comfy_api_nodes/util/conversions.py @@ -12,6 +12,7 @@ from PIL import Image from comfy.utils import common_upscale from comfy_api.latest import Input, InputImpl +from comfy_api.util import VideoContainer, VideoCodec from ._helpers import mimetype_to_extension @@ -173,6 +174,30 @@ def audio_to_base64_string(audio: Input.Audio, container_format: str = "mp4", co return base64.b64encode(audio_bytes).decode("utf-8") +def video_to_base64_string( + video: Input.Video, + container_format: VideoContainer = None, + codec: VideoCodec = None +) -> str: + """ + Converts a video input to a base64 string. + + Args: + video: The video input to convert + container_format: Optional container format to use (defaults to video.container if available) + codec: Optional codec to use (defaults to video.codec if available) + """ + video_bytes_io = BytesIO() + + # Use provided format/codec if specified, otherwise use video's own if available + format_to_use = container_format if container_format is not None else getattr(video, 'container', VideoContainer.MP4) + codec_to_use = codec if codec is not None else getattr(video, 'codec', VideoCodec.H264) + + video.save_to(video_bytes_io, format=format_to_use, codec=codec_to_use) + video_bytes_io.seek(0) + return base64.b64encode(video_bytes_io.getvalue()).decode("utf-8") + + def audio_ndarray_to_bytesio( audio_data_np: np.ndarray, sample_rate: int, From 098a352f136c610071bcb74f13e5b0ca16e6e7b3 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 25 Oct 2025 17:05:22 -0700 Subject: [PATCH 488/759] Add warning for torch-directml usage (#10482) Added a warning message about the state of torch-directml. --- comfy/model_management.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy/model_management.py b/comfy/model_management.py index cf015a29a..afe78f36e 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -89,6 +89,7 @@ if args.deterministic: directml_enabled = False if args.directml is not None: + logging.warning("WARNING: torch-directml barely works, is very slow, has not been updated in over 1 year and might be removed soon, please don't use it, there are better options.") import torch_directml directml_enabled = True device_index = args.directml From f6bbc1ac846b7d9a73ae50c3a45cf5a41058c54d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 25 Oct 2025 20:07:29 -0700 Subject: [PATCH 489/759] Fix mistake. (#10484) --- comfy/sample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/sample.py b/comfy/sample.py index b1395da84..2f8f3a51c 100644 --- a/comfy/sample.py +++ b/comfy/sample.py @@ -17,7 +17,7 @@ def prepare_noise_inner(latent_image, generator, noise_inds=None): if i in unique_inds: noises.append(noise) noises = [noises[i] for i in inverse] - noises = torch.cat(noises, axis=0) + return torch.cat(noises, axis=0) def prepare_noise(latent_image, seed, noise_inds=None): """ From 9d529e53084bdec28f684f3886a26c93598e7338 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sun, 26 Oct 2025 08:51:06 +0200 Subject: [PATCH 490/759] fix(api-nodes): random issues on Windows by capturing general OSError for retries (#10486) --- comfy_api_nodes/util/client.py | 15 +++++---------- comfy_api_nodes/util/download_helpers.py | 6 +++--- comfy_api_nodes/util/upload_helpers.py | 4 ++-- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py index 5833b118f..9c036d64b 100644 --- a/comfy_api_nodes/util/client.py +++ b/comfy_api_nodes/util/client.py @@ -2,7 +2,6 @@ import asyncio import contextlib import json import logging -import socket import time import uuid from dataclasses import dataclass @@ -456,24 +455,20 @@ async def _diagnose_connectivity() -> dict[str, bool]: results = { "internet_accessible": False, "api_accessible": False, - "is_local_issue": False, - "is_api_issue": False, } timeout = aiohttp.ClientTimeout(total=5.0) async with aiohttp.ClientSession(timeout=timeout) as session: - try: + with contextlib.suppress(ClientError, OSError): async with session.get("https://www.google.com") as resp: results["internet_accessible"] = resp.status < 500 - except (ClientError, asyncio.TimeoutError, socket.gaierror): - results["is_local_issue"] = True + if not results["internet_accessible"]: return results parsed = urlparse(default_base_url()) health_url = f"{parsed.scheme}://{parsed.netloc}/health" - with contextlib.suppress(ClientError, asyncio.TimeoutError): + with contextlib.suppress(ClientError, OSError): async with session.get(health_url) as resp: results["api_accessible"] = resp.status < 500 - results["is_api_issue"] = results["internet_accessible"] and not results["api_accessible"] return results @@ -790,7 +785,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): except ProcessingInterrupted: logging.debug("Polling was interrupted by user") raise - except (ClientError, asyncio.TimeoutError, socket.gaierror) as e: + except (ClientError, OSError) as e: if attempt <= cfg.max_retries: logging.warning( "Connection error calling %s %s. Retrying in %.2fs (%d/%d): %s", @@ -824,7 +819,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): delay *= cfg.retry_backoff continue diag = await _diagnose_connectivity() - if diag.get("is_local_issue"): + if not diag["internet_accessible"]: try: request_logger.log_request_response( operation_id=operation_id, diff --git a/comfy_api_nodes/util/download_helpers.py b/comfy_api_nodes/util/download_helpers.py index 791dd5a50..f89045e12 100644 --- a/comfy_api_nodes/util/download_helpers.py +++ b/comfy_api_nodes/util/download_helpers.py @@ -32,7 +32,7 @@ async def download_url_to_bytesio( dest: Optional[Union[BytesIO, IO[bytes], str, Path]], *, timeout: Optional[float] = None, - max_retries: int = 3, + max_retries: int = 5, retry_delay: float = 1.0, retry_backoff: float = 2.0, cls: type[COMFY_IO.ComfyNode] = None, @@ -177,7 +177,7 @@ async def download_url_to_bytesio( return except asyncio.CancelledError: raise ProcessingInterrupted("Task cancelled") from None - except (ClientError, asyncio.TimeoutError) as e: + except (ClientError, OSError) as e: if attempt <= max_retries: with contextlib.suppress(Exception): request_logger.log_request_response( @@ -191,7 +191,7 @@ async def download_url_to_bytesio( continue diag = await _diagnose_connectivity() - if diag.get("is_local_issue"): + if not diag["internet_accessible"]: raise LocalNetworkError( "Unable to connect to the network. Please check your internet connection and try again." ) from e diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py index a345d451d..7bfc61704 100644 --- a/comfy_api_nodes/util/upload_helpers.py +++ b/comfy_api_nodes/util/upload_helpers.py @@ -290,7 +290,7 @@ async def upload_file( return except asyncio.CancelledError: raise ProcessingInterrupted("Task cancelled") from None - except (aiohttp.ClientError, asyncio.TimeoutError) as e: + except (aiohttp.ClientError, OSError) as e: if attempt <= max_retries: with contextlib.suppress(Exception): request_logger.log_request_response( @@ -313,7 +313,7 @@ async def upload_file( continue diag = await _diagnose_connectivity() - if diag.get("is_local_issue"): + if not diag["internet_accessible"]: raise LocalNetworkError( "Unable to connect to the network. Please check your internet connection and try again." ) from e From c170fd2db598a0bdce56f80e22e83e10ad731421 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 26 Oct 2025 17:23:01 -0700 Subject: [PATCH 491/759] Bump portable deps workflow to torch cu130 python 3.13.9 (#10493) --- .github/workflows/windows_release_dependencies.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows_release_dependencies.yml b/.github/workflows/windows_release_dependencies.yml index f1e2946e6..f61ee21a2 100644 --- a/.github/workflows/windows_release_dependencies.yml +++ b/.github/workflows/windows_release_dependencies.yml @@ -17,7 +17,7 @@ on: description: 'cuda version' required: true type: string - default: "129" + default: "130" python_minor: description: 'python minor version' @@ -29,7 +29,7 @@ on: description: 'python patch version' required: true type: string - default: "6" + default: "9" # push: # branches: # - master From 601ee1775a3c06c9b4de1fa7d808af8625b2fcd5 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 27 Oct 2025 20:54:00 -0700 Subject: [PATCH 492/759] Add a bat to run comfyui portable without api nodes. (#10504) --- .../advanced/run_nvidia_gpu_disable_api_nodes.bat | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat diff --git a/.ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat b/.ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat new file mode 100644 index 000000000..cfe4b9f0e --- /dev/null +++ b/.ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat @@ -0,0 +1,2 @@ +..\python_embeded\python.exe -s ..\ComfyUI\main.py --windows-standalone-build --disable-api-nodes +pause From c305deed56a6ed259563b2047d9fcd51471e6590 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 28 Oct 2025 13:24:16 +0800 Subject: [PATCH 493/759] Update template to 0.2.3 (#10503) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8570c66b6..121301669 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.28.7 -comfyui-workflow-templates==0.2.2 +comfyui-workflow-templates==0.2.3 comfyui-embedded-docs==0.3.0 torch torchsde From 55bad303754eb60fa98f3ccf598e95502b819149 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 28 Oct 2025 07:25:29 +0200 Subject: [PATCH 494/759] feat(api-nodes): add LTXV API nodes (#10496) --- comfy_api_nodes/nodes_ltxv.py | 191 ++++++++++++++++++++++++++++++++++ nodes.py | 1 + 2 files changed, 192 insertions(+) create mode 100644 comfy_api_nodes/nodes_ltxv.py diff --git a/comfy_api_nodes/nodes_ltxv.py b/comfy_api_nodes/nodes_ltxv.py new file mode 100644 index 000000000..e6ad6e27a --- /dev/null +++ b/comfy_api_nodes/nodes_ltxv.py @@ -0,0 +1,191 @@ +from io import BytesIO +from typing import Optional + +import torch +from pydantic import BaseModel, Field +from typing_extensions import override + +from comfy_api.input_impl import VideoFromFile +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.util import ( + ApiEndpoint, + get_number_of_images, + sync_op_raw, + upload_images_to_comfyapi, + validate_string, +) + +MODELS_MAP = { + "LTX-2 (Pro)": "ltx-2-pro", + "LTX-2 (Fast)": "ltx-2-fast", +} + + +class ExecuteTaskRequest(BaseModel): + prompt: str = Field(...) + model: str = Field(...) + duration: int = Field(...) + resolution: str = Field(...) + fps: Optional[int] = Field(25) + generate_audio: Optional[bool] = Field(True) + image_uri: Optional[str] = Field(None) + + +class TextToVideoNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="LtxvApiTextToVideo", + display_name="LTXV Text To Video", + category="api node/video/LTXV", + description="Professional-quality videos with customizable duration and resolution.", + inputs=[ + IO.Combo.Input("model", options=list(MODELS_MAP.keys())), + IO.String.Input( + "prompt", + multiline=True, + default="", + ), + IO.Combo.Input("duration", options=[6, 8, 10], default=8), + IO.Combo.Input( + "resolution", + options=[ + "1920x1080", + "2560x1440", + "3840x2160", + ], + ), + IO.Combo.Input("fps", options=[25, 50], default=25), + IO.Boolean.Input( + "generate_audio", + default=False, + optional=True, + tooltip="When true, the generated video will include AI-generated audio matching the scene.", + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + duration: int, + resolution: str, + fps: int = 25, + generate_audio: bool = False, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=10000) + response = await sync_op_raw( + cls, + ApiEndpoint("/proxy/ltx/v1/text-to-video", "POST"), + data=ExecuteTaskRequest( + prompt=prompt, + model=MODELS_MAP[model], + duration=duration, + resolution=resolution, + fps=fps, + generate_audio=generate_audio, + ), + as_binary=True, + max_retries=1, + ) + return IO.NodeOutput(VideoFromFile(BytesIO(response))) + + +class ImageToVideoNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="LtxvApiImageToVideo", + display_name="LTXV Image To Video", + category="api node/video/LTXV", + description="Professional-quality videos with customizable duration and resolution based on start image.", + inputs=[ + IO.Image.Input("image", tooltip="First frame to be used for the video."), + IO.Combo.Input("model", options=list(MODELS_MAP.keys())), + IO.String.Input( + "prompt", + multiline=True, + default="", + ), + IO.Combo.Input("duration", options=[6, 8, 10], default=8), + IO.Combo.Input( + "resolution", + options=[ + "1920x1080", + "2560x1440", + "3840x2160", + ], + ), + IO.Combo.Input("fps", options=[25, 50], default=25), + IO.Boolean.Input( + "generate_audio", + default=False, + optional=True, + tooltip="When true, the generated video will include AI-generated audio matching the scene.", + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + image: torch.Tensor, + model: str, + prompt: str, + duration: int, + resolution: str, + fps: int = 25, + generate_audio: bool = False, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=10000) + if get_number_of_images(image) != 1: + raise ValueError("Currently only one input image is supported.") + response = await sync_op_raw( + cls, + ApiEndpoint("/proxy/ltx/v1/image-to-video", "POST"), + data=ExecuteTaskRequest( + image_uri=(await upload_images_to_comfyapi(cls, image, max_images=1, mime_type="image/png"))[0], + prompt=prompt, + model=MODELS_MAP[model], + duration=duration, + resolution=resolution, + fps=fps, + generate_audio=generate_audio, + ), + as_binary=True, + max_retries=1, + ) + return IO.NodeOutput(VideoFromFile(BytesIO(response))) + + +class LtxvApiExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + TextToVideoNode, + ImageToVideoNode, + ] + + +async def comfy_entrypoint() -> LtxvApiExtension: + return LtxvApiExtension() diff --git a/nodes.py b/nodes.py index 7cfa8ca14..12e365ca9 100644 --- a/nodes.py +++ b/nodes.py @@ -2349,6 +2349,7 @@ async def init_builtin_api_nodes(): "nodes_kling.py", "nodes_bfl.py", "nodes_bytedance.py", + "nodes_ltxv.py", "nodes_luma.py", "nodes_recraft.py", "nodes_pixverse.py", From 6abc30aae9bd13f31dafd32552a365f2df2cf715 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 28 Oct 2025 13:56:30 +0800 Subject: [PATCH 495/759] Update template to 0.2.4 (#10505) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 121301669..cc3d4ca94 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.28.7 -comfyui-workflow-templates==0.2.3 +comfyui-workflow-templates==0.2.4 comfyui-embedded-docs==0.3.0 torch torchsde From 614b8d3345424481d94a22fe7496d908c1a5c526 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Tue, 28 Oct 2025 00:01:13 -0700 Subject: [PATCH 496/759] frontend bump to 1.28.8 (#10506) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index cc3d4ca94..4d84b0d3e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.28.7 +comfyui-frontend-package==1.28.8 comfyui-workflow-templates==0.2.4 comfyui-embedded-docs==0.3.0 torch From f2bb3230b796f6a486894fc3b597db2c0b9538c9 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 28 Oct 2025 03:03:59 -0400 Subject: [PATCH 497/759] ComfyUI version v0.3.67 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 33a06bbb0..db48b05c4 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.66" +__version__ = "0.3.67" diff --git a/pyproject.toml b/pyproject.toml index fcc4854a5..ab054355c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.66" +version = "0.3.67" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From b61a40cbc9c2eb648b4d22bb513ed3ab2e2f0fd7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 28 Oct 2025 00:21:45 -0700 Subject: [PATCH 498/759] Bump stable portable to cu130 python 3.13.9 (#10508) --- .github/workflows/release-stable-all.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release-stable-all.yml b/.github/workflows/release-stable-all.yml index 5c1024599..7dca7277b 100644 --- a/.github/workflows/release-stable-all.yml +++ b/.github/workflows/release-stable-all.yml @@ -18,9 +18,9 @@ jobs: uses: ./.github/workflows/stable-release.yml with: git_tag: ${{ inputs.git_tag }} - cache_tag: "cu129" + cache_tag: "cu130" python_minor: "13" - python_patch: "6" + python_patch: "9" rel_name: "nvidia" rel_extra_name: "" test_release: true From 8cf2ba4ba64203551276513068ee81145e90f0bc Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 28 Oct 2025 00:23:52 -0700 Subject: [PATCH 499/759] Remove comfy api key from queue api. (#10502) --- execution.py | 8 +++----- main.py | 11 +++++++++-- server.py | 11 ++++++++--- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/execution.py b/execution.py index 78c36a4b0..b14bb14c7 100644 --- a/execution.py +++ b/execution.py @@ -1116,7 +1116,7 @@ class PromptQueue: messages: List[str] def task_done(self, item_id, history_result, - status: Optional['PromptQueue.ExecutionStatus']): + status: Optional['PromptQueue.ExecutionStatus'], process_item=None): with self.mutex: prompt = self.currently_running.pop(item_id) if len(self.history) > MAXIMUM_HISTORY_SIZE: @@ -1126,10 +1126,8 @@ class PromptQueue: if status is not None: status_dict = copy.deepcopy(status._asdict()) - # Remove sensitive data from extra_data before storing in history - for sensitive_val in SENSITIVE_EXTRA_DATA_KEYS: - if sensitive_val in prompt[3]: - prompt[3].pop(sensitive_val) + if process_item is not None: + prompt = process_item(prompt) self.history[prompt[1]] = { "prompt": prompt, diff --git a/main.py b/main.py index 4b4c5dcc4..8d466d2eb 100644 --- a/main.py +++ b/main.py @@ -192,14 +192,21 @@ def prompt_worker(q, server_instance): prompt_id = item[1] server_instance.last_prompt_id = prompt_id - e.execute(item[2], prompt_id, item[3], item[4]) + sensitive = item[5] + extra_data = item[3].copy() + for k in sensitive: + extra_data[k] = sensitive[k] + + e.execute(item[2], prompt_id, extra_data, item[4]) need_gc = True + + remove_sensitive = lambda prompt: prompt[:5] + prompt[6:] q.task_done(item_id, e.history_result, status=execution.PromptQueue.ExecutionStatus( status_str='success' if e.success else 'error', completed=e.success, - messages=e.status_messages)) + messages=e.status_messages), process_item=remove_sensitive) if server_instance.client_id is not None: server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id}, server_instance.client_id) diff --git a/server.py b/server.py index fe58db286..5d773b10a 100644 --- a/server.py +++ b/server.py @@ -691,8 +691,9 @@ class PromptServer(): async def get_queue(request): queue_info = {} current_queue = self.prompt_queue.get_current_queue_volatile() - queue_info['queue_running'] = current_queue[0] - queue_info['queue_pending'] = current_queue[1] + remove_sensitive = lambda queue: [x[:5] for x in queue] + queue_info['queue_running'] = remove_sensitive(current_queue[0]) + queue_info['queue_pending'] = remove_sensitive(current_queue[1]) return web.json_response(queue_info) @routes.post("/prompt") @@ -728,7 +729,11 @@ class PromptServer(): extra_data["client_id"] = json_data["client_id"] if valid[0]: outputs_to_execute = valid[2] - self.prompt_queue.put((number, prompt_id, prompt, extra_data, outputs_to_execute)) + sensitive = {} + for sensitive_val in execution.SENSITIVE_EXTRA_DATA_KEYS: + if sensitive_val in extra_data: + sensitive[sensitive_val] = extra_data.pop(sensitive_val) + self.prompt_queue.put((number, prompt_id, prompt, extra_data, outputs_to_execute, sensitive)) response = {"prompt_id": prompt_id, "number": number, "node_errors": valid[3]} return web.json_response(response) else: From 3bea4efc6b23d76c6b0672cd90421a9024e13fdb Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 28 Oct 2025 01:45:45 -0700 Subject: [PATCH 500/759] Tell users to update nvidia drivers if problem with portable. (#10510) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 434d4ff06..4204777e9 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,8 @@ Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you If you have trouble extracting it, right click the file -> properties -> unblock +Update your Nvidia drivers if it doesn't start. + #### Alternative Downloads: [Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z) From 22e40d2ace0f53da025b3a41cbe4b664ef807097 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 28 Oct 2025 12:08:08 -0700 Subject: [PATCH 501/759] Tell users to update their nvidia drivers if portable doesn't start. (#10518) --- .../advanced/run_nvidia_gpu_disable_api_nodes.bat | 1 + .ci/windows_nvidia_base_files/run_nvidia_gpu.bat | 1 + .../run_nvidia_gpu_fast_fp16_accumulation.bat | 1 + 3 files changed, 3 insertions(+) diff --git a/.ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat b/.ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat index cfe4b9f0e..ed00583b6 100644 --- a/.ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat +++ b/.ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat @@ -1,2 +1,3 @@ ..\python_embeded\python.exe -s ..\ComfyUI\main.py --windows-standalone-build --disable-api-nodes +echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest. pause diff --git a/.ci/windows_nvidia_base_files/run_nvidia_gpu.bat b/.ci/windows_nvidia_base_files/run_nvidia_gpu.bat index 274d7c948..4898a424f 100755 --- a/.ci/windows_nvidia_base_files/run_nvidia_gpu.bat +++ b/.ci/windows_nvidia_base_files/run_nvidia_gpu.bat @@ -1,2 +1,3 @@ .\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build +echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest. pause diff --git a/.ci/windows_nvidia_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat b/.ci/windows_nvidia_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat index 38f06ecb2..32611e4af 100644 --- a/.ci/windows_nvidia_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat +++ b/.ci/windows_nvidia_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat @@ -1,2 +1,3 @@ .\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation +echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest. pause From 8817f8fc148c5a63ffd3f854975df8e72c740540 Mon Sep 17 00:00:00 2001 From: contentis Date: Tue, 28 Oct 2025 21:20:53 +0100 Subject: [PATCH 502/759] Mixed Precision Quantization System (#10498) * Implement mixed precision operations with a registry design and metadate for quant spec in checkpoint. * Updated design using Tensor Subclasses * Fix FP8 MM * An actually functional POC * Remove CK reference and ensure correct compute dtype * Update unit tests * ruff lint * Implement mixed precision operations with a registry design and metadate for quant spec in checkpoint. * Updated design using Tensor Subclasses * Fix FP8 MM * An actually functional POC * Remove CK reference and ensure correct compute dtype * Update unit tests * ruff lint * Fix missing keys * Rename quant dtype parameter * Rename quant dtype parameter * Fix unittests for CPU build --- comfy/model_base.py | 10 +- comfy/model_detection.py | 20 + comfy/ops.py | 146 +++++- comfy/quant_ops.py | 437 ++++++++++++++++++ comfy/sd.py | 13 +- comfy/supported_models_base.py | 1 + .../comfy_quant/test_mixed_precision.py | 232 ++++++++++ tests-unit/comfy_quant/test_quant_registry.py | 190 ++++++++ 8 files changed, 1030 insertions(+), 19 deletions(-) create mode 100644 comfy/quant_ops.py create mode 100644 tests-unit/comfy_quant/test_mixed_precision.py create mode 100644 tests-unit/comfy_quant/test_quant_registry.py diff --git a/comfy/model_base.py b/comfy/model_base.py index e877f19ac..7c788d085 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -134,7 +134,7 @@ class BaseModel(torch.nn.Module): if not unet_config.get("disable_unet_model_creation", False): if model_config.custom_operations is None: fp8 = model_config.optimizations.get("fp8", False) - operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8) + operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8, model_config=model_config) else: operations = model_config.custom_operations self.diffusion_model = unet_model(**unet_config, device=device, operations=operations) @@ -333,6 +333,14 @@ class BaseModel(torch.nn.Module): if self.model_config.scaled_fp8 is not None: unet_state_dict["scaled_fp8"] = torch.tensor([], dtype=self.model_config.scaled_fp8) + # Save mixed precision metadata + if hasattr(self.model_config, 'layer_quant_config') and self.model_config.layer_quant_config: + metadata = { + "format_version": "1.0", + "layers": self.model_config.layer_quant_config + } + unet_state_dict["_quantization_metadata"] = metadata + unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict) if self.model_type == ModelType.V_PREDICTION: diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 141f1e164..3142a7fc3 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -6,6 +6,20 @@ import math import logging import torch + +def detect_layer_quantization(metadata): + quant_key = "_quantization_metadata" + if metadata is not None and quant_key in metadata: + quant_metadata = metadata.pop(quant_key) + quant_metadata = json.loads(quant_metadata) + if isinstance(quant_metadata, dict) and "layers" in quant_metadata: + logging.info(f"Found quantization metadata (version {quant_metadata.get('format_version', 'unknown')})") + return quant_metadata["layers"] + else: + raise ValueError("Invalid quantization metadata format") + return None + + def count_blocks(state_dict_keys, prefix_string): count = 0 while True: @@ -701,6 +715,12 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal else: model_config.optimizations["fp8"] = True + # Detect per-layer quantization (mixed precision) + layer_quant_config = detect_layer_quantization(metadata) + if layer_quant_config: + model_config.layer_quant_config = layer_quant_config + logging.info(f"Detected mixed precision quantization: {len(layer_quant_config)} layers quantized") + return model_config def unet_prefix_from_state_dict(state_dict): diff --git a/comfy/ops.py b/comfy/ops.py index 934e21261..93731eedf 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -344,6 +344,10 @@ class manual_cast(disable_weight_init): def fp8_linear(self, input): + """ + Legacy FP8 linear function for backward compatibility. + Uses QuantizedTensor subclass for dispatch. + """ dtype = self.weight.dtype if dtype not in [torch.float8_e4m3fn]: return None @@ -355,9 +359,9 @@ def fp8_linear(self, input): input_shape = input.shape input_dtype = input.dtype + if len(input.shape) == 3: w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype) - w = w.t() scale_weight = self.scale_weight scale_input = self.scale_input @@ -368,23 +372,18 @@ def fp8_linear(self, input): if scale_input is None: scale_input = torch.ones((), device=input.device, dtype=torch.float32) - input = torch.clamp(input, min=-448, max=448, out=input) - input = input.reshape(-1, input_shape[2]).to(dtype).contiguous() else: scale_input = scale_input.to(input.device) - input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype).contiguous() - if bias is not None: - o = torch._scaled_mm(input, w, out_dtype=input_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight) - else: - o = torch._scaled_mm(input, w, out_dtype=input_dtype, scale_a=scale_input, scale_b=scale_weight) - - if isinstance(o, tuple): - o = o[0] + # Wrap weight in QuantizedTensor - this enables unified dispatch + # Call F.linear - __torch_dispatch__ routes to fp8_linear handler in quant_ops.py! + layout_params_weight = {'scale': scale_weight, 'orig_dtype': input_dtype} + quantized_weight = QuantizedTensor(w, TensorCoreFP8Layout, layout_params_weight) + quantized_input = QuantizedTensor.from_float(input.reshape(-1, input_shape[2]), TensorCoreFP8Layout, scale=scale_input, dtype=dtype) + o = torch.nn.functional.linear(quantized_input, quantized_weight, bias) if tensor_2d: return o.reshape(input_shape[0], -1) - return o.reshape((-1, input_shape[1], self.weight.shape[0])) return None @@ -478,7 +477,128 @@ if CUBLAS_IS_AVAILABLE: def forward(self, *args, **kwargs): return super().forward(*args, **kwargs) -def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None): + +# ============================================================================== +# Mixed Precision Operations +# ============================================================================== +from .quant_ops import QuantizedTensor, TensorCoreFP8Layout + +QUANT_FORMAT_MIXINS = { + "float8_e4m3fn": { + "dtype": torch.float8_e4m3fn, + "layout_type": TensorCoreFP8Layout, + "parameters": { + "weight_scale": torch.nn.Parameter(torch.zeros((), dtype=torch.float32), requires_grad=False), + "input_scale": torch.nn.Parameter(torch.zeros((), dtype=torch.float32), requires_grad=False), + } + } +} + +class MixedPrecisionOps(disable_weight_init): + _layer_quant_config = {} + _compute_dtype = torch.bfloat16 + + class Linear(torch.nn.Module, CastWeightBiasOp): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + super().__init__() + + self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype} + # self.factory_kwargs = {"device": device, "dtype": dtype} + + self.in_features = in_features + self.out_features = out_features + if bias: + self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs)) + else: + self.register_parameter("bias", None) + + self.tensor_class = None + + def reset_parameters(self): + return None + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, error_msgs): + + device = self.factory_kwargs["device"] + layer_name = prefix.rstrip('.') + weight_key = f"{prefix}weight" + weight = state_dict.pop(weight_key, None) + if weight is None: + raise ValueError(f"Missing weight for layer {layer_name}") + + manually_loaded_keys = [weight_key] + + if layer_name not in MixedPrecisionOps._layer_quant_config: + self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False) + else: + quant_format = MixedPrecisionOps._layer_quant_config[layer_name].get("format", None) + if quant_format is None: + raise ValueError(f"Unknown quantization format for layer {layer_name}") + + mixin = QUANT_FORMAT_MIXINS[quant_format] + self.layout_type = mixin["layout_type"] + + scale_key = f"{prefix}weight_scale" + layout_params = { + 'scale': state_dict.pop(scale_key, None), + 'orig_dtype': MixedPrecisionOps._compute_dtype + } + if layout_params['scale'] is not None: + manually_loaded_keys.append(scale_key) + + self.weight = torch.nn.Parameter( + QuantizedTensor(weight.to(device=device, dtype=mixin["dtype"]), self.layout_type, layout_params), + requires_grad=False + ) + + for param_name, param_value in mixin["parameters"].items(): + param_key = f"{prefix}{param_name}" + _v = state_dict.pop(param_key, None) + if _v is None: + continue + setattr(self, param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False)) + manually_loaded_keys.append(param_key) + + super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) + + for key in manually_loaded_keys: + if key in missing_keys: + missing_keys.remove(key) + + def _forward(self, input, weight, bias): + return torch.nn.functional.linear(input, weight, bias) + + def forward_comfy_cast_weights(self, input): + weight, bias = cast_bias_weight(self, input) + return self._forward(input, weight, bias) + + def forward(self, input, *args, **kwargs): + run_every_op() + + if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: + return self.forward_comfy_cast_weights(input, *args, **kwargs) + if (getattr(self, 'layout_type', None) is not None and + getattr(self, 'input_scale', None) is not None and + not isinstance(input, QuantizedTensor)): + input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, fp8_dtype=self.weight.dtype) + return self._forward(input, self.weight, self.bias) + + +def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None, model_config=None): + if model_config and hasattr(model_config, 'layer_quant_config') and model_config.layer_quant_config: + MixedPrecisionOps._layer_quant_config = model_config.layer_quant_config + MixedPrecisionOps._compute_dtype = compute_dtype + logging.info(f"Using mixed precision operations: {len(model_config.layer_quant_config)} quantized layers") + return MixedPrecisionOps + fp8_compute = comfy.model_management.supports_fp8_compute(load_device) if scaled_fp8 is not None: return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py new file mode 100644 index 000000000..b14e03084 --- /dev/null +++ b/comfy/quant_ops.py @@ -0,0 +1,437 @@ +import torch +import logging +from typing import Tuple, Dict + +_LAYOUT_REGISTRY = {} +_GENERIC_UTILS = {} + + +def register_layout_op(torch_op, layout_type): + """ + Decorator to register a layout-specific operation handler. + Args: + torch_op: PyTorch operation (e.g., torch.ops.aten.linear.default) + layout_type: Layout class (e.g., TensorCoreFP8Layout) + Example: + @register_layout_op(torch.ops.aten.linear.default, TensorCoreFP8Layout) + def fp8_linear(func, args, kwargs): + # FP8-specific linear implementation + ... + """ + def decorator(handler_func): + if torch_op not in _LAYOUT_REGISTRY: + _LAYOUT_REGISTRY[torch_op] = {} + _LAYOUT_REGISTRY[torch_op][layout_type] = handler_func + return handler_func + return decorator + + +def register_generic_util(torch_op): + """ + Decorator to register a generic utility that works for all layouts. + Args: + torch_op: PyTorch operation (e.g., torch.ops.aten.detach.default) + + Example: + @register_generic_util(torch.ops.aten.detach.default) + def generic_detach(func, args, kwargs): + # Works for any layout + ... + """ + def decorator(handler_func): + _GENERIC_UTILS[torch_op] = handler_func + return handler_func + return decorator + + +def _get_layout_from_args(args): + for arg in args: + if isinstance(arg, QuantizedTensor): + return arg._layout_type + elif isinstance(arg, (list, tuple)): + for item in arg: + if isinstance(item, QuantizedTensor): + return item._layout_type + return None + + +def _move_layout_params_to_device(params, device): + new_params = {} + for k, v in params.items(): + if isinstance(v, torch.Tensor): + new_params[k] = v.to(device=device) + else: + new_params[k] = v + return new_params + + +def _copy_layout_params(params): + new_params = {} + for k, v in params.items(): + if isinstance(v, torch.Tensor): + new_params[k] = v.clone() + else: + new_params[k] = v + return new_params + + +class QuantizedLayout: + """ + Base class for quantization layouts. + + A layout encapsulates the format-specific logic for quantization/dequantization + and provides a uniform interface for extracting raw tensors needed for computation. + + New quantization formats should subclass this and implement the required methods. + """ + @classmethod + def quantize(cls, tensor, **kwargs) -> Tuple[torch.Tensor, Dict]: + raise NotImplementedError(f"{cls.__name__} must implement quantize()") + + @staticmethod + def dequantize(qdata, **layout_params) -> torch.Tensor: + raise NotImplementedError("TensorLayout must implement dequantize()") + + @classmethod + def get_plain_tensors(cls, qtensor) -> torch.Tensor: + raise NotImplementedError(f"{cls.__name__} must implement get_plain_tensors()") + + +class QuantizedTensor(torch.Tensor): + """ + Universal quantized tensor that works with any layout. + + This tensor subclass uses a pluggable layout system to support multiple + quantization formats (FP8, INT4, INT8, etc.) without code duplication. + + The layout_type determines format-specific behavior, while common operations + (detach, clone, to) are handled generically. + + Attributes: + _qdata: The quantized tensor data + _layout_type: Layout class (e.g., TensorCoreFP8Layout) + _layout_params: Dict with layout-specific params (scale, zero_point, etc.) + """ + + @staticmethod + def __new__(cls, qdata, layout_type, layout_params): + """ + Create a quantized tensor. + + Args: + qdata: The quantized data tensor + layout_type: Layout class (subclass of QuantizedLayout) + layout_params: Dict with layout-specific parameters + """ + return torch.Tensor._make_subclass(cls, qdata, require_grad=False) + + def __init__(self, qdata, layout_type, layout_params): + self._qdata = qdata.contiguous() + self._layout_type = layout_type + self._layout_params = layout_params + + def __repr__(self): + layout_name = self._layout_type.__name__ + param_str = ", ".join(f"{k}={v}" for k, v in list(self._layout_params.items())[:2]) + return f"QuantizedTensor(shape={self.shape}, layout={layout_name}, {param_str})" + + @property + def layout_type(self): + return self._layout_type + + def __tensor_flatten__(self): + """ + Tensor flattening protocol for proper device movement. + """ + inner_tensors = ["_qdata"] + ctx = { + "layout_type": self._layout_type, + } + + tensor_params = {} + non_tensor_params = {} + for k, v in self._layout_params.items(): + if isinstance(v, torch.Tensor): + tensor_params[k] = v + else: + non_tensor_params[k] = v + + ctx["tensor_param_keys"] = list(tensor_params.keys()) + ctx["non_tensor_params"] = non_tensor_params + + for k, v in tensor_params.items(): + attr_name = f"_layout_param_{k}" + object.__setattr__(self, attr_name, v) + inner_tensors.append(attr_name) + + return inner_tensors, ctx + + @staticmethod + def __tensor_unflatten__(inner_tensors, ctx, outer_size, outer_stride): + """ + Tensor unflattening protocol for proper device movement. + Reconstructs the QuantizedTensor after device movement. + """ + layout_type = ctx["layout_type"] + layout_params = dict(ctx["non_tensor_params"]) + + for key in ctx["tensor_param_keys"]: + attr_name = f"_layout_param_{key}" + layout_params[key] = inner_tensors[attr_name] + + return QuantizedTensor(inner_tensors["_q_data"], layout_type, layout_params) + + @classmethod + def from_float(cls, tensor, layout_type, **quantize_kwargs) -> 'QuantizedTensor': + qdata, layout_params = layout_type.quantize(tensor, **quantize_kwargs) + return cls(qdata, layout_type, layout_params) + + def dequantize(self) -> torch.Tensor: + return self._layout_type.dequantize(self._qdata, **self._layout_params) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + + # Step 1: Check generic utilities first (detach, clone, to, etc.) + if func in _GENERIC_UTILS: + return _GENERIC_UTILS[func](func, args, kwargs) + + # Step 2: Check layout-specific handlers (linear, matmul, etc.) + layout_type = _get_layout_from_args(args) + if layout_type and func in _LAYOUT_REGISTRY: + handler = _LAYOUT_REGISTRY[func].get(layout_type) + if handler: + return handler(func, args, kwargs) + + # Step 3: Fallback to dequantization + if isinstance(args[0] if args else None, QuantizedTensor): + logging.info(f"QuantizedTensor: Unhandled operation {func}, falling back to dequantization. kwargs={kwargs}") + return cls._dequant_and_fallback(func, args, kwargs) + + @classmethod + def _dequant_and_fallback(cls, func, args, kwargs): + def dequant_arg(arg): + if isinstance(arg, QuantizedTensor): + return arg.dequantize() + elif isinstance(arg, (list, tuple)): + return type(arg)(dequant_arg(a) for a in arg) + return arg + + new_args = dequant_arg(args) + new_kwargs = dequant_arg(kwargs) + return func(*new_args, **new_kwargs) + + +# ============================================================================== +# Generic Utilities (Layout-Agnostic Operations) +# ============================================================================== + +def _create_transformed_qtensor(qt, transform_fn): + new_data = transform_fn(qt._qdata) + new_params = _copy_layout_params(qt._layout_params) + return QuantizedTensor(new_data, qt._layout_type, new_params) + + +def _handle_device_transfer(qt, target_device, target_dtype=None, target_layout=None, op_name="to"): + if target_dtype is not None and target_dtype != qt.dtype: + logging.warning( + f"QuantizedTensor: dtype conversion requested to {target_dtype}, " + f"but not supported for quantized tensors. Ignoring dtype." + ) + + if target_layout is not None and target_layout != torch.strided: + logging.warning( + f"QuantizedTensor: layout change requested to {target_layout}, " + f"but not supported. Ignoring layout." + ) + + # Handle device transfer + current_device = qt._qdata.device + if target_device is not None: + # Normalize device for comparison + if isinstance(target_device, str): + target_device = torch.device(target_device) + if isinstance(current_device, str): + current_device = torch.device(current_device) + + if target_device != current_device: + logging.debug(f"QuantizedTensor.{op_name}: Moving from {current_device} to {target_device}") + new_q_data = qt._qdata.to(device=target_device) + new_params = _move_layout_params_to_device(qt._layout_params, target_device) + new_qt = QuantizedTensor(new_q_data, qt._layout_type, new_params) + logging.debug(f"QuantizedTensor.{op_name}: Created new tensor on {target_device}") + return new_qt + + logging.debug(f"QuantizedTensor.{op_name}: No device change needed, returning original") + return qt + + +@register_generic_util(torch.ops.aten.detach.default) +def generic_detach(func, args, kwargs): + """Detach operation - creates a detached copy of the quantized tensor.""" + qt = args[0] + if isinstance(qt, QuantizedTensor): + return _create_transformed_qtensor(qt, lambda x: x.detach()) + return func(*args, **kwargs) + + +@register_generic_util(torch.ops.aten.clone.default) +def generic_clone(func, args, kwargs): + """Clone operation - creates a deep copy of the quantized tensor.""" + qt = args[0] + if isinstance(qt, QuantizedTensor): + return _create_transformed_qtensor(qt, lambda x: x.clone()) + return func(*args, **kwargs) + + +@register_generic_util(torch.ops.aten._to_copy.default) +def generic_to_copy(func, args, kwargs): + """Device/dtype transfer operation - handles .to(device) calls.""" + qt = args[0] + if isinstance(qt, QuantizedTensor): + return _handle_device_transfer( + qt, + target_device=kwargs.get('device', None), + target_dtype=kwargs.get('dtype', None), + op_name="_to_copy" + ) + return func(*args, **kwargs) + + +@register_generic_util(torch.ops.aten.to.dtype_layout) +def generic_to_dtype_layout(func, args, kwargs): + """Handle .to(device) calls using the dtype_layout variant.""" + qt = args[0] + if isinstance(qt, QuantizedTensor): + return _handle_device_transfer( + qt, + target_device=kwargs.get('device', None), + target_dtype=kwargs.get('dtype', None), + target_layout=kwargs.get('layout', None), + op_name="to" + ) + return func(*args, **kwargs) + + +@register_generic_util(torch.ops.aten.copy_.default) +def generic_copy_(func, args, kwargs): + qt_dest = args[0] + src = args[1] + + if isinstance(qt_dest, QuantizedTensor): + if isinstance(src, QuantizedTensor): + # Copy from another quantized tensor + qt_dest._qdata.copy_(src._qdata) + qt_dest._layout_type = src._layout_type + qt_dest._layout_params = _copy_layout_params(src._layout_params) + else: + # Copy from regular tensor - just copy raw data + qt_dest._qdata.copy_(src) + return qt_dest + return func(*args, **kwargs) + + +@register_generic_util(torch.ops.aten._has_compatible_shallow_copy_type.default) +def generic_has_compatible_shallow_copy_type(func, args, kwargs): + return True + +# ============================================================================== +# FP8 Layout + Operation Handlers +# ============================================================================== +class TensorCoreFP8Layout(QuantizedLayout): + """ + Storage format: + - qdata: FP8 tensor (torch.float8_e4m3fn or torch.float8_e5m2) + - scale: Scalar tensor (float32) for dequantization + - orig_dtype: Original dtype before quantization (for casting back) + """ + @classmethod + def quantize(cls, tensor, scale=None, dtype=torch.float8_e4m3fn): + orig_dtype = tensor.dtype + + if scale is None: + scale = torch.amax(tensor.abs()) / torch.finfo(dtype).max + + if not isinstance(scale, torch.Tensor): + scale = torch.tensor(scale) + scale = scale.to(device=tensor.device, dtype=torch.float32) + + lp_amax = torch.finfo(dtype).max + tensor_scaled = tensor.float() / scale + torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled) + qdata = tensor_scaled.to(dtype, memory_format=torch.contiguous_format) + + layout_params = { + 'scale': scale, + 'orig_dtype': orig_dtype + } + return qdata, layout_params + + @staticmethod + def dequantize(qdata, scale, orig_dtype, **kwargs): + plain_tensor = torch.ops.aten._to_copy.default(qdata, dtype=orig_dtype) + return plain_tensor * scale + + @classmethod + def get_plain_tensors(cls, qtensor): + return qtensor._qdata, qtensor._layout_params['scale'] + + +@register_layout_op(torch.ops.aten.linear.default, TensorCoreFP8Layout) +def fp8_linear(func, args, kwargs): + input_tensor = args[0] + weight = args[1] + bias = args[2] if len(args) > 2 else None + + if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor): + plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor) + plain_weight, scale_b = TensorCoreFP8Layout.get_plain_tensors(weight) + + out_dtype = kwargs.get("out_dtype") + if out_dtype is None: + out_dtype = input_tensor._layout_params['orig_dtype'] + + weight_t = plain_weight.t() + + tensor_2d = False + if len(plain_input.shape) == 2: + tensor_2d = True + plain_input = plain_input.unsqueeze(1) + + input_shape = plain_input.shape + if len(input_shape) != 3: + return None + + try: + output = torch._scaled_mm( + plain_input.reshape(-1, input_shape[2]), + weight_t, + bias=bias, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=out_dtype, + ) + if not tensor_2d: + output = output.reshape((-1, input_shape[1], weight.shape[0])) + + if output.dtype in [torch.float8_e4m3fn, torch.float8_e5m2]: + output_scale = scale_a * scale_b + output_params = { + 'scale': output_scale, + 'orig_dtype': input_tensor._layout_params['orig_dtype'] + } + return QuantizedTensor(output, TensorCoreFP8Layout, output_params) + else: + return output + + except Exception as e: + raise RuntimeError(f"FP8 _scaled_mm failed, falling back to dequantization: {e}") + + # Case 2: DQ Fallback + if isinstance(weight, QuantizedTensor): + weight = weight.dequantize() + if isinstance(input_tensor, QuantizedTensor): + input_tensor = input_tensor.dequantize() + + return torch.nn.functional.linear(input_tensor, weight, bias) diff --git a/comfy/sd.py b/comfy/sd.py index 28bee248d..6411bb27d 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -1262,7 +1262,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c return (model_patcher, clip, vae, clipvision) -def load_diffusion_model_state_dict(sd, model_options={}): +def load_diffusion_model_state_dict(sd, model_options={}, metadata=None): """ Loads a UNet diffusion model from a state dictionary, supporting both diffusers and regular formats. @@ -1296,7 +1296,7 @@ def load_diffusion_model_state_dict(sd, model_options={}): weight_dtype = comfy.utils.weight_dtype(sd) load_device = model_management.get_torch_device() - model_config = model_detection.model_config_from_unet(sd, "") + model_config = model_detection.model_config_from_unet(sd, "", metadata=metadata) if model_config is not None: new_sd = sd @@ -1330,7 +1330,10 @@ def load_diffusion_model_state_dict(sd, model_options={}): else: unet_dtype = dtype - manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes) + if hasattr(model_config, "layer_quant_config"): + manual_cast_dtype = model_management.unet_manual_cast(None, load_device, model_config.supported_inference_dtypes) + else: + manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes) model_config.set_inference_dtype(unet_dtype, manual_cast_dtype) model_config.custom_operations = model_options.get("custom_operations", model_config.custom_operations) if model_options.get("fp8_optimizations", False): @@ -1346,8 +1349,8 @@ def load_diffusion_model_state_dict(sd, model_options={}): def load_diffusion_model(unet_path, model_options={}): - sd = comfy.utils.load_torch_file(unet_path) - model = load_diffusion_model_state_dict(sd, model_options=model_options) + sd, metadata = comfy.utils.load_torch_file(unet_path, return_metadata=True) + model = load_diffusion_model_state_dict(sd, model_options=model_options, metadata=metadata) if model is None: logging.error("ERROR UNSUPPORTED DIFFUSION MODEL {}".format(unet_path)) raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(unet_path, model_detection_error_hint(unet_path, sd))) diff --git a/comfy/supported_models_base.py b/comfy/supported_models_base.py index 54573abb1..e4bd74514 100644 --- a/comfy/supported_models_base.py +++ b/comfy/supported_models_base.py @@ -50,6 +50,7 @@ class BASE: manual_cast_dtype = None custom_operations = None scaled_fp8 = None + layer_quant_config = None # Per-layer quantization configuration for mixed precision optimizations = {"fp8": False} @classmethod diff --git a/tests-unit/comfy_quant/test_mixed_precision.py b/tests-unit/comfy_quant/test_mixed_precision.py new file mode 100644 index 000000000..267bc177b --- /dev/null +++ b/tests-unit/comfy_quant/test_mixed_precision.py @@ -0,0 +1,232 @@ +import unittest +import torch +import sys +import os + +# Add comfy to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) + +def has_gpu(): + return torch.cuda.is_available() + +from comfy.cli_args import args +if not has_gpu(): + args.cpu = True + +from comfy import ops +from comfy.quant_ops import QuantizedTensor, TensorCoreFP8Layout + + +class SimpleModel(torch.nn.Module): + def __init__(self, operations=ops.disable_weight_init): + super().__init__() + self.layer1 = operations.Linear(10, 20, device="cpu", dtype=torch.bfloat16) + self.layer2 = operations.Linear(20, 30, device="cpu", dtype=torch.bfloat16) + self.layer3 = operations.Linear(30, 40, device="cpu", dtype=torch.bfloat16) + + def forward(self, x): + x = self.layer1(x) + x = torch.nn.functional.relu(x) + x = self.layer2(x) + x = torch.nn.functional.relu(x) + x = self.layer3(x) + return x + + +class TestMixedPrecisionOps(unittest.TestCase): + + def test_all_layers_standard(self): + """Test that model with no quantization works normally""" + # Configure no quantization + ops.MixedPrecisionOps._layer_quant_config = {} + + # Create model + model = SimpleModel(operations=ops.MixedPrecisionOps) + + # Initialize weights manually + model.layer1.weight = torch.nn.Parameter(torch.randn(20, 10, dtype=torch.bfloat16)) + model.layer1.bias = torch.nn.Parameter(torch.randn(20, dtype=torch.bfloat16)) + model.layer2.weight = torch.nn.Parameter(torch.randn(30, 20, dtype=torch.bfloat16)) + model.layer2.bias = torch.nn.Parameter(torch.randn(30, dtype=torch.bfloat16)) + model.layer3.weight = torch.nn.Parameter(torch.randn(40, 30, dtype=torch.bfloat16)) + model.layer3.bias = torch.nn.Parameter(torch.randn(40, dtype=torch.bfloat16)) + + # Initialize weight_function and bias_function + for layer in [model.layer1, model.layer2, model.layer3]: + layer.weight_function = [] + layer.bias_function = [] + + # Forward pass + input_tensor = torch.randn(5, 10, dtype=torch.bfloat16) + output = model(input_tensor) + + self.assertEqual(output.shape, (5, 40)) + self.assertEqual(output.dtype, torch.bfloat16) + + def test_mixed_precision_load(self): + """Test loading a mixed precision model from state dict""" + # Configure mixed precision: layer1 is FP8, layer2 and layer3 are standard + layer_quant_config = { + "layer1": { + "format": "float8_e4m3fn", + "params": {} + }, + "layer3": { + "format": "float8_e4m3fn", + "params": {} + } + } + ops.MixedPrecisionOps._layer_quant_config = layer_quant_config + + # Create state dict with mixed precision + fp8_weight1 = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn) + fp8_weight3 = torch.randn(40, 30, dtype=torch.float32).to(torch.float8_e4m3fn) + + state_dict = { + # Layer 1: FP8 E4M3FN + "layer1.weight": fp8_weight1, + "layer1.bias": torch.randn(20, dtype=torch.bfloat16), + "layer1.weight_scale": torch.tensor(2.0, dtype=torch.float32), + + # Layer 2: Standard BF16 + "layer2.weight": torch.randn(30, 20, dtype=torch.bfloat16), + "layer2.bias": torch.randn(30, dtype=torch.bfloat16), + + # Layer 3: FP8 E4M3FN + "layer3.weight": fp8_weight3, + "layer3.bias": torch.randn(40, dtype=torch.bfloat16), + "layer3.weight_scale": torch.tensor(1.5, dtype=torch.float32), + } + + # Create model and load state dict (strict=False because custom loading pops keys) + model = SimpleModel(operations=ops.MixedPrecisionOps) + model.load_state_dict(state_dict, strict=False) + + # Verify weights are wrapped in QuantizedTensor + self.assertIsInstance(model.layer1.weight, QuantizedTensor) + self.assertEqual(model.layer1.weight._layout_type, TensorCoreFP8Layout) + + # Layer 2 should NOT be quantized + self.assertNotIsInstance(model.layer2.weight, QuantizedTensor) + + # Layer 3 should be quantized + self.assertIsInstance(model.layer3.weight, QuantizedTensor) + self.assertEqual(model.layer3.weight._layout_type, TensorCoreFP8Layout) + + # Verify scales were loaded + self.assertEqual(model.layer1.weight._layout_params['scale'].item(), 2.0) + self.assertEqual(model.layer3.weight._layout_params['scale'].item(), 1.5) + + # Forward pass + input_tensor = torch.randn(5, 10, dtype=torch.bfloat16) + output = model(input_tensor) + + self.assertEqual(output.shape, (5, 40)) + + def test_state_dict_quantized_preserved(self): + """Test that quantized weights are preserved in state_dict()""" + # Configure mixed precision + layer_quant_config = { + "layer1": { + "format": "float8_e4m3fn", + "params": {} + } + } + ops.MixedPrecisionOps._layer_quant_config = layer_quant_config + + # Create and load model + fp8_weight = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn) + state_dict1 = { + "layer1.weight": fp8_weight, + "layer1.bias": torch.randn(20, dtype=torch.bfloat16), + "layer1.weight_scale": torch.tensor(3.0, dtype=torch.float32), + "layer2.weight": torch.randn(30, 20, dtype=torch.bfloat16), + "layer2.bias": torch.randn(30, dtype=torch.bfloat16), + "layer3.weight": torch.randn(40, 30, dtype=torch.bfloat16), + "layer3.bias": torch.randn(40, dtype=torch.bfloat16), + } + + model = SimpleModel(operations=ops.MixedPrecisionOps) + model.load_state_dict(state_dict1, strict=False) + + # Save state dict + state_dict2 = model.state_dict() + + # Verify layer1.weight is a QuantizedTensor with scale preserved + self.assertIsInstance(state_dict2["layer1.weight"], QuantizedTensor) + self.assertEqual(state_dict2["layer1.weight"]._layout_params['scale'].item(), 3.0) + self.assertEqual(state_dict2["layer1.weight"]._layout_type, TensorCoreFP8Layout) + + # Verify non-quantized layers are standard tensors + self.assertNotIsInstance(state_dict2["layer2.weight"], QuantizedTensor) + self.assertNotIsInstance(state_dict2["layer3.weight"], QuantizedTensor) + + def test_weight_function_compatibility(self): + """Test that weight_function (LoRA) works with quantized layers""" + # Configure FP8 quantization + layer_quant_config = { + "layer1": { + "format": "float8_e4m3fn", + "params": {} + } + } + ops.MixedPrecisionOps._layer_quant_config = layer_quant_config + + # Create and load model + fp8_weight = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn) + state_dict = { + "layer1.weight": fp8_weight, + "layer1.bias": torch.randn(20, dtype=torch.bfloat16), + "layer1.weight_scale": torch.tensor(2.0, dtype=torch.float32), + "layer2.weight": torch.randn(30, 20, dtype=torch.bfloat16), + "layer2.bias": torch.randn(30, dtype=torch.bfloat16), + "layer3.weight": torch.randn(40, 30, dtype=torch.bfloat16), + "layer3.bias": torch.randn(40, dtype=torch.bfloat16), + } + + model = SimpleModel(operations=ops.MixedPrecisionOps) + model.load_state_dict(state_dict, strict=False) + + # Add a weight function (simulating LoRA) + # This should trigger dequantization during forward pass + def apply_lora(weight): + lora_delta = torch.randn_like(weight) * 0.01 + return weight + lora_delta + + model.layer1.weight_function.append(apply_lora) + + # Forward pass should work with LoRA (triggers weight_function path) + input_tensor = torch.randn(5, 10, dtype=torch.bfloat16) + output = model(input_tensor) + + self.assertEqual(output.shape, (5, 40)) + + def test_error_handling_unknown_format(self): + """Test that unknown formats raise error""" + # Configure with unknown format + layer_quant_config = { + "layer1": { + "format": "unknown_format_xyz", + "params": {} + } + } + ops.MixedPrecisionOps._layer_quant_config = layer_quant_config + + # Create state dict + state_dict = { + "layer1.weight": torch.randn(20, 10, dtype=torch.bfloat16), + "layer1.bias": torch.randn(20, dtype=torch.bfloat16), + "layer2.weight": torch.randn(30, 20, dtype=torch.bfloat16), + "layer2.bias": torch.randn(30, dtype=torch.bfloat16), + "layer3.weight": torch.randn(40, 30, dtype=torch.bfloat16), + "layer3.bias": torch.randn(40, dtype=torch.bfloat16), + } + + # Load should raise KeyError for unknown format in QUANT_FORMAT_MIXINS + model = SimpleModel(operations=ops.MixedPrecisionOps) + with self.assertRaises(KeyError): + model.load_state_dict(state_dict, strict=False) + +if __name__ == "__main__": + unittest.main() + diff --git a/tests-unit/comfy_quant/test_quant_registry.py b/tests-unit/comfy_quant/test_quant_registry.py new file mode 100644 index 000000000..477811029 --- /dev/null +++ b/tests-unit/comfy_quant/test_quant_registry.py @@ -0,0 +1,190 @@ +import unittest +import torch +import sys +import os + +# Add comfy to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) + +def has_gpu(): + return torch.cuda.is_available() + +from comfy.cli_args import args +if not has_gpu(): + args.cpu = True + +from comfy.quant_ops import QuantizedTensor, TensorCoreFP8Layout + + +class TestQuantizedTensor(unittest.TestCase): + """Test the QuantizedTensor subclass with FP8 layout""" + + def test_creation(self): + """Test creating a QuantizedTensor with TensorCoreFP8Layout""" + fp8_data = torch.randn(256, 128, dtype=torch.float32).to(torch.float8_e4m3fn) + scale = torch.tensor(2.0) + layout_params = {'scale': scale, 'orig_dtype': torch.bfloat16} + + qt = QuantizedTensor(fp8_data, TensorCoreFP8Layout, layout_params) + + self.assertIsInstance(qt, QuantizedTensor) + self.assertEqual(qt.shape, (256, 128)) + self.assertEqual(qt.dtype, torch.float8_e4m3fn) + self.assertEqual(qt._layout_params['scale'], scale) + self.assertEqual(qt._layout_params['orig_dtype'], torch.bfloat16) + self.assertEqual(qt._layout_type, TensorCoreFP8Layout) + + def test_dequantize(self): + """Test explicit dequantization""" + + fp8_data = torch.ones(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) + scale = torch.tensor(3.0) + layout_params = {'scale': scale, 'orig_dtype': torch.float32} + + qt = QuantizedTensor(fp8_data, TensorCoreFP8Layout, layout_params) + dequantized = qt.dequantize() + + self.assertEqual(dequantized.dtype, torch.float32) + self.assertTrue(torch.allclose(dequantized, torch.ones(10, 20) * 3.0, rtol=0.1)) + + def test_from_float(self): + """Test creating QuantizedTensor from float tensor""" + float_tensor = torch.randn(64, 32, dtype=torch.float32) + scale = torch.tensor(1.5) + + qt = QuantizedTensor.from_float( + float_tensor, + TensorCoreFP8Layout, + scale=scale, + dtype=torch.float8_e4m3fn + ) + + self.assertIsInstance(qt, QuantizedTensor) + self.assertEqual(qt.dtype, torch.float8_e4m3fn) + self.assertEqual(qt.shape, (64, 32)) + + # Verify dequantization gives approximately original values + dequantized = qt.dequantize() + mean_rel_error = ((dequantized - float_tensor).abs() / (float_tensor.abs() + 1e-6)).mean() + self.assertLess(mean_rel_error, 0.1) + + +class TestGenericUtilities(unittest.TestCase): + """Test generic utility operations""" + + def test_detach(self): + """Test detach operation on quantized tensor""" + fp8_data = torch.randn(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) + scale = torch.tensor(1.5) + layout_params = {'scale': scale, 'orig_dtype': torch.float32} + qt = QuantizedTensor(fp8_data, TensorCoreFP8Layout, layout_params) + + # Detach should return a new QuantizedTensor + qt_detached = qt.detach() + + self.assertIsInstance(qt_detached, QuantizedTensor) + self.assertEqual(qt_detached.shape, qt.shape) + self.assertEqual(qt_detached._layout_type, TensorCoreFP8Layout) + + def test_clone(self): + """Test clone operation on quantized tensor""" + fp8_data = torch.randn(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) + scale = torch.tensor(1.5) + layout_params = {'scale': scale, 'orig_dtype': torch.float32} + qt = QuantizedTensor(fp8_data, TensorCoreFP8Layout, layout_params) + + # Clone should return a new QuantizedTensor + qt_cloned = qt.clone() + + self.assertIsInstance(qt_cloned, QuantizedTensor) + self.assertEqual(qt_cloned.shape, qt.shape) + self.assertEqual(qt_cloned._layout_type, TensorCoreFP8Layout) + + # Verify it's a deep copy + self.assertIsNot(qt_cloned._qdata, qt._qdata) + + @unittest.skipUnless(has_gpu(), "GPU not available") + def test_to_device(self): + """Test device transfer""" + fp8_data = torch.randn(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) + scale = torch.tensor(1.5) + layout_params = {'scale': scale, 'orig_dtype': torch.float32} + qt = QuantizedTensor(fp8_data, TensorCoreFP8Layout, layout_params) + + # Moving to same device should work (CPU to CPU) + qt_cpu = qt.to('cpu') + + self.assertIsInstance(qt_cpu, QuantizedTensor) + self.assertEqual(qt_cpu.device.type, 'cpu') + self.assertEqual(qt_cpu._layout_params['scale'].device.type, 'cpu') + + +class TestTensorCoreFP8Layout(unittest.TestCase): + """Test the TensorCoreFP8Layout implementation""" + + def test_quantize(self): + """Test quantization method""" + float_tensor = torch.randn(32, 64, dtype=torch.float32) + scale = torch.tensor(1.5) + + qdata, layout_params = TensorCoreFP8Layout.quantize( + float_tensor, + scale=scale, + dtype=torch.float8_e4m3fn + ) + + self.assertEqual(qdata.dtype, torch.float8_e4m3fn) + self.assertEqual(qdata.shape, float_tensor.shape) + self.assertIn('scale', layout_params) + self.assertIn('orig_dtype', layout_params) + self.assertEqual(layout_params['orig_dtype'], torch.float32) + + def test_dequantize(self): + """Test dequantization method""" + float_tensor = torch.ones(10, 20, dtype=torch.float32) * 3.0 + scale = torch.tensor(1.0) + + qdata, layout_params = TensorCoreFP8Layout.quantize( + float_tensor, + scale=scale, + dtype=torch.float8_e4m3fn + ) + + dequantized = TensorCoreFP8Layout.dequantize(qdata, **layout_params) + + # Should approximately match original + self.assertTrue(torch.allclose(dequantized, float_tensor, rtol=0.1, atol=0.1)) + + +class TestFallbackMechanism(unittest.TestCase): + """Test fallback for unsupported operations""" + + def test_unsupported_op_dequantizes(self): + """Test that unsupported operations fall back to dequantization""" + # Set seed for reproducibility + torch.manual_seed(42) + + # Create quantized tensor + a_fp32 = torch.randn(10, 20, dtype=torch.float32) + scale = torch.tensor(1.0) + a_q = QuantizedTensor.from_float( + a_fp32, + TensorCoreFP8Layout, + scale=scale, + dtype=torch.float8_e4m3fn + ) + + # Call an operation that doesn't have a registered handler + # For example, torch.abs + result = torch.abs(a_q) + + # Should work via fallback (dequantize → abs → return) + self.assertNotIsInstance(result, QuantizedTensor) + expected = torch.abs(a_fp32) + # FP8 introduces quantization error, so use loose tolerance + mean_error = (result - expected).abs().mean() + self.assertLess(mean_error, 0.05, f"Mean error {mean_error:.4f} is too large") + + +if __name__ == "__main__": + unittest.main() From d202c2ba7404affd58a2199aeb514b3cc48e0ef3 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Wed, 29 Oct 2025 06:22:08 +1000 Subject: [PATCH 503/759] execution: Allow a subgraph nodes to execute multiple times (#10499) In the case of --cache-none lazy and subgraph execution can cause anything to be run multiple times per workflow. If that rerun nodes is in itself a subgraph generator, this will crash for two reasons. pending_subgraph_results[] does not cleanup entries after their use. So when a pending_subgraph_result is consumed, remove it from the list so that if the corresponding node is fully re-executed this misses lookup and it fall through to execute the node as it should. Secondly, theres is an explicit enforcement against dups in the addition of subgraphs nodes as ephemerals to the dymprompt. Remove this enforcement as the use case is now valid. --- execution.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/execution.py b/execution.py index b14bb14c7..20e106213 100644 --- a/execution.py +++ b/execution.py @@ -445,6 +445,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, resolved_outputs.append(tuple(resolved_output)) output_data = merge_result_data(resolved_outputs, class_def) output_ui = [] + del pending_subgraph_results[unique_id] has_subgraph = False else: get_progress_state().start_progress(unique_id) @@ -527,10 +528,6 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, if new_graph is None: cached_outputs.append((False, node_outputs)) else: - # Check for conflicts - for node_id in new_graph.keys(): - if dynprompt.has_node(node_id): - raise DuplicateNodeError(f"Attempt to add duplicate node {node_id}. Ensure node ids are unique and deterministic or use graph_utils.GraphBuilder.") for node_id, node_info in new_graph.items(): new_node_ids.append(node_id) display_id = node_info.get("override_display_id", unique_id) From 210f7a1ba580d57d817ca68346cb72b8d0a26ad2 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 28 Oct 2025 23:38:05 +0200 Subject: [PATCH 504/759] convert nodes_recraft.py to V3 schema (#10507) --- comfy_api_nodes/nodes_recraft.py | 1319 +++++++++++++----------------- 1 file changed, 585 insertions(+), 734 deletions(-) diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index 8ee7e55c4..dee186cd6 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -1,82 +1,71 @@ -from __future__ import annotations -from inspect import cleandoc -from typing import Optional +from io import BytesIO +from typing import Optional, Union + +import aiohttp +import torch +from PIL import UnidentifiedImageError +from typing_extensions import override + from comfy.utils import ProgressBar -from comfy_extras.nodes_images import SVG # Added -from comfy.comfy_types.node_typing import IO +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.apinode_utils import ( + resize_mask_to_image, +) from comfy_api_nodes.apis.recraft_api import ( - RecraftImageGenerationRequest, - RecraftImageGenerationResponse, - RecraftImageSize, - RecraftModel, - RecraftStyle, - RecraftStyleV3, RecraftColor, RecraftColorChain, RecraftControls, + RecraftImageGenerationRequest, + RecraftImageGenerationResponse, + RecraftImageSize, RecraftIO, + RecraftModel, + RecraftStyle, + RecraftStyleV3, get_v3_substyles, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - EmptyRequest, + bytesio_to_image_tensor, + download_url_as_bytesio, + sync_op, + tensor_to_bytesio, + validate_string, ) -from comfy_api_nodes.apinode_utils import ( - download_url_to_bytesio, - resize_mask_to_image, -) -from comfy_api_nodes.util import validate_string, tensor_to_bytesio, bytesio_to_image_tensor -from server import PromptServer - -import torch -from io import BytesIO -from PIL import UnidentifiedImageError -import aiohttp +from comfy_extras.nodes_images import SVG async def handle_recraft_file_request( + cls: type[IO.ComfyNode], image: torch.Tensor, path: str, - mask: torch.Tensor=None, - total_pixels=4096*4096, - timeout=1024, + mask: Optional[torch.Tensor] = None, + total_pixels: int = 4096 * 4096, + timeout: int = 1024, request=None, - auth_kwargs: dict[str,str] = None, ) -> list[BytesIO]: - """ - Handle sending common Recraft file-only request to get back file bytes. - """ - if request is None: - request = EmptyRequest() + """Handle sending common Recraft file-only request to get back file bytes.""" - files = { - 'image': tensor_to_bytesio(image, total_pixels=total_pixels).read() - } + files = {"image": tensor_to_bytesio(image, total_pixels=total_pixels).read()} if mask is not None: - files['mask'] = tensor_to_bytesio(mask, total_pixels=total_pixels).read() + files["mask"] = tensor_to_bytesio(mask, total_pixels=total_pixels).read() - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=type(request), - response_model=RecraftImageGenerationResponse, - ), - request=request, + response = await sync_op( + cls, + endpoint=ApiEndpoint(path=path, method="POST"), + response_model=RecraftImageGenerationResponse, + data=request if request else None, files=files, content_type="multipart/form-data", - auth_kwargs=auth_kwargs, multipart_parser=recraft_multipart_parser, + max_retries=1, ) - response: RecraftImageGenerationResponse = await operation.execute() all_bytesio = [] if response.image is not None: - all_bytesio.append(await download_url_to_bytesio(response.image.url, timeout=timeout)) + all_bytesio.append(await download_url_as_bytesio(response.image.url, timeout=timeout)) else: for data in response.data: - all_bytesio.append(await download_url_to_bytesio(data.url, timeout=timeout)) + all_bytesio.append(await download_url_as_bytesio(data.url, timeout=timeout)) return all_bytesio @@ -84,11 +73,11 @@ async def handle_recraft_file_request( def recraft_multipart_parser( data, parent_key=None, - formatter: callable = None, - converted_to_check: list[list] = None, + formatter: Optional[type[callable]] = None, + converted_to_check: Optional[list[list]] = None, is_list: bool = False, - return_mode: str = "formdata" # "dict" | "formdata" -) -> dict | aiohttp.FormData: + return_mode: str = "formdata", # "dict" | "formdata" +) -> Union[dict, aiohttp.FormData]: """ Formats data such that multipart/form-data will work with aiohttp library when both files and data are present. @@ -108,8 +97,8 @@ def recraft_multipart_parser( # Modification of a function that handled a different type of multipart parsing, big ups: # https://gist.github.com/kazqvaizer/4cebebe5db654a414132809f9f88067b - def handle_converted_lists(item, parent_key, lists_to_check=tuple[list]): - # if list already exists exists, just extend list with data + def handle_converted_lists(item, parent_key, lists_to_check=list[list]): + # if list already exists, just extend list with data for check_list in lists_to_check: for conv_tuple in check_list: if conv_tuple[0] == parent_key and isinstance(conv_tuple[1], list): @@ -125,7 +114,7 @@ def recraft_multipart_parser( formatter = lambda v: v # Multipart representation of value if not isinstance(data, dict): - # if list already exists exists, just extend list with data + # if list already exists, just extend list with data added = handle_converted_lists(data, parent_key, converted_to_check) if added: return {} @@ -146,7 +135,9 @@ def recraft_multipart_parser( elif isinstance(value, list): for ind, list_value in enumerate(value): iter_key = f"{current_key}[]" - converted.extend(recraft_multipart_parser(list_value, iter_key, formatter, next_check, is_list=True).items()) + converted.extend( + recraft_multipart_parser(list_value, iter_key, formatter, next_check, is_list=True).items() + ) else: converted.append((current_key, formatter(value))) @@ -166,6 +157,7 @@ class handle_recraft_image_output: """ Catch an exception related to receiving SVG data instead of image, when Infinite Style Library style_id is in use. """ + def __init__(self): pass @@ -174,243 +166,225 @@ class handle_recraft_image_output: def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is not None and exc_type is UnidentifiedImageError: - raise Exception("Received output data was not an image; likely an SVG. If you used style_id, make sure it is not a Vector art style.") + raise Exception( + "Received output data was not an image; likely an SVG. " + "If you used style_id, make sure it is not a Vector art style." + ) -class RecraftColorRGBNode: - """ - Create Recraft Color by choosing specific RGB values. - """ - - RETURN_TYPES = (RecraftIO.COLOR,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - RETURN_NAMES = ("recraft_color",) - FUNCTION = "create_color" - CATEGORY = "api node/image/Recraft" +class RecraftColorRGBNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftColorRGB", + display_name="Recraft Color RGB", + category="api node/image/Recraft", + description="Create Recraft Color by choosing specific RGB values.", + inputs=[ + IO.Int.Input("r", default=0, min=0, max=255, tooltip="Red value of color."), + IO.Int.Input("g", default=0, min=0, max=255, tooltip="Green value of color."), + IO.Int.Input("b", default=0, min=0, max=255, tooltip="Blue value of color."), + IO.Custom(RecraftIO.COLOR).Input("recraft_color", optional=True), + ], + outputs=[ + IO.Custom(RecraftIO.COLOR).Output(display_name="recraft_color"), + ], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "r": (IO.INT, { - "default": 0, - "min": 0, - "max": 255, - "tooltip": "Red value of color." - }), - "g": (IO.INT, { - "default": 0, - "min": 0, - "max": 255, - "tooltip": "Green value of color." - }), - "b": (IO.INT, { - "default": 0, - "min": 0, - "max": 255, - "tooltip": "Blue value of color." - }), - }, - "optional": { - "recraft_color": (RecraftIO.COLOR,), - } - } - - def create_color(self, r: int, g: int, b: int, recraft_color: RecraftColorChain=None): + def execute(cls, r: int, g: int, b: int, recraft_color: RecraftColorChain = None) -> IO.NodeOutput: recraft_color = recraft_color.clone() if recraft_color else RecraftColorChain() recraft_color.add(RecraftColor(r, g, b)) - return (recraft_color, ) + return IO.NodeOutput(recraft_color) -class RecraftControlsNode: - """ - Create Recraft Controls for customizing Recraft generation. - """ - - RETURN_TYPES = (RecraftIO.CONTROLS,) - RETURN_NAMES = ("recraft_controls",) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "create_controls" - CATEGORY = "api node/image/Recraft" +class RecraftControlsNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftControls", + display_name="Recraft Controls", + category="api node/image/Recraft", + description="Create Recraft Controls for customizing Recraft generation.", + inputs=[ + IO.Custom(RecraftIO.COLOR).Input("colors", optional=True), + IO.Custom(RecraftIO.COLOR).Input("background_color", optional=True), + ], + outputs=[ + IO.Custom(RecraftIO.CONTROLS).Output(display_name="recraft_controls"), + ], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - }, - "optional": { - "colors": (RecraftIO.COLOR,), - "background_color": (RecraftIO.COLOR,), - } - } - - def create_controls(self, colors: RecraftColorChain=None, background_color: RecraftColorChain=None): - return (RecraftControls(colors=colors, background_color=background_color), ) + def execute(cls, colors: RecraftColorChain = None, background_color: RecraftColorChain = None) -> IO.NodeOutput: + return IO.NodeOutput(RecraftControls(colors=colors, background_color=background_color)) -class RecraftStyleV3RealisticImageNode: - """ - Select realistic_image style and optional substyle. - """ - - RETURN_TYPES = (RecraftIO.STYLEV3,) - RETURN_NAMES = ("recraft_style",) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "create_style" - CATEGORY = "api node/image/Recraft" - +class RecraftStyleV3RealisticImageNode(IO.ComfyNode): RECRAFT_STYLE = RecraftStyleV3.realistic_image @classmethod - def INPUT_TYPES(s): - return { - "required": { - "substyle": (get_v3_substyles(s.RECRAFT_STYLE),), - } - } + def define_schema(cls): + return IO.Schema( + node_id="RecraftStyleV3RealisticImage", + display_name="Recraft Style - Realistic Image", + category="api node/image/Recraft", + description="Select realistic_image style and optional substyle.", + inputs=[ + IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)), + ], + outputs=[ + IO.Custom(RecraftIO.STYLEV3).Output(display_name="recraft_style"), + ], + ) - def create_style(self, substyle: str): + @classmethod + def execute(cls, substyle: str) -> IO.NodeOutput: if substyle == "None": substyle = None - return (RecraftStyle(self.RECRAFT_STYLE, substyle),) + return IO.NodeOutput(RecraftStyle(cls.RECRAFT_STYLE, substyle)) class RecraftStyleV3DigitalIllustrationNode(RecraftStyleV3RealisticImageNode): - """ - Select digital_illustration style and optional substyle. - """ - RECRAFT_STYLE = RecraftStyleV3.digital_illustration + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftStyleV3DigitalIllustration", + display_name="Recraft Style - Digital Illustration", + category="api node/image/Recraft", + description="Select realistic_image style and optional substyle.", + inputs=[ + IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)), + ], + outputs=[ + IO.Custom(RecraftIO.STYLEV3).Output(display_name="recraft_style"), + ], + ) + class RecraftStyleV3VectorIllustrationNode(RecraftStyleV3RealisticImageNode): - """ - Select vector_illustration style and optional substyle. - """ - RECRAFT_STYLE = RecraftStyleV3.vector_illustration + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftStyleV3VectorIllustrationNode", + display_name="Recraft Style - Realistic Image", + category="api node/image/Recraft", + description="Select realistic_image style and optional substyle.", + inputs=[ + IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)), + ], + outputs=[ + IO.Custom(RecraftIO.STYLEV3).Output(display_name="recraft_style"), + ], + ) + class RecraftStyleV3LogoRasterNode(RecraftStyleV3RealisticImageNode): - """ - Select vector_illustration style and optional substyle. - """ - - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "substyle": (get_v3_substyles(s.RECRAFT_STYLE, include_none=False),), - } - } - RECRAFT_STYLE = RecraftStyleV3.logo_raster + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftStyleV3LogoRaster", + display_name="Recraft Style - Logo Raster", + category="api node/image/Recraft", + description="Select realistic_image style and optional substyle.", + inputs=[ + IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE, include_none=False)), + ], + outputs=[ + IO.Custom(RecraftIO.STYLEV3).Output(display_name="recraft_style"), + ], + ) -class RecraftStyleInfiniteStyleLibrary: - """ - Select style based on preexisting UUID from Recraft's Infinite Style Library. - """ - RETURN_TYPES = (RecraftIO.STYLEV3,) - RETURN_NAMES = ("recraft_style",) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "create_style" - CATEGORY = "api node/image/Recraft" +class RecraftStyleInfiniteStyleLibrary(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftStyleV3InfiniteStyleLibrary", + display_name="Recraft Style - Infinite Style Library", + category="api node/image/Recraft", + description="Select style based on preexisting UUID from Recraft's Infinite Style Library.", + inputs=[ + IO.String.Input("style_id", default="", tooltip="UUID of style from Infinite Style Library."), + ], + outputs=[ + IO.Custom(RecraftIO.STYLEV3).Output(display_name="recraft_style"), + ], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "style_id": (IO.STRING, { - "default": "", - "tooltip": "UUID of style from Infinite Style Library.", - }) - } - } - - def create_style(self, style_id: str): + def execute(cls, style_id: str) -> IO.NodeOutput: if not style_id: raise Exception("The style_id input cannot be empty.") - return (RecraftStyle(style_id=style_id),) + return IO.NodeOutput(RecraftStyle(style_id=style_id)) -class RecraftTextToImageNode: - """ - Generates images synchronously based on prompt and resolution. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftTextToImageNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftTextToImageNode", + display_name="Recraft Text to Image", + category="api node/image/Recraft", + description="Generates images synchronously based on prompt and resolution.", + inputs=[ + IO.String.Input("prompt", multiline=True, default="", tooltip="Prompt for the image generation."), + IO.Combo.Input( + "size", + options=[res.value for res in RecraftImageSize], + default=RecraftImageSize.res_1024x1024, + tooltip="The size of the generated image.", + ), + IO.Int.Input( + "n", + default=1, + min=1, + max=6, + tooltip="The number of images to generate.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.Custom(RecraftIO.STYLEV3).Input("recraft_style", optional=True), + IO.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + IO.Custom(RecraftIO.CONTROLS).Input( + "recraft_controls", + tooltip="Optional additional controls over the generation via the Recraft Controls node.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation.", - }, - ), - "size": ( - [res.value for res in RecraftImageSize], - { - "default": RecraftImageSize.res_1024x1024, - "tooltip": "The size of the generated image.", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 6, - "tooltip": "The number of images to generate.", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "recraft_style": (RecraftIO.STYLEV3,), - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - "recraft_controls": ( - RecraftIO.CONTROLS, - { - "tooltip": "Optional additional controls over the generation via the Recraft Controls node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, size: str, n: int, @@ -418,9 +392,7 @@ class RecraftTextToImageNode: recraft_style: RecraftStyle = None, negative_prompt: str = None, recraft_controls: RecraftControls = None, - unique_id: Optional[str] = None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False, max_length=1000) default_style = RecraftStyle(RecraftStyleV3.realistic_image) if recraft_style is None: @@ -433,14 +405,11 @@ class RecraftTextToImageNode: if not negative_prompt: negative_prompt = None - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/recraft/image_generation", - method=HttpMethod.POST, - request_model=RecraftImageGenerationRequest, - response_model=RecraftImageGenerationResponse, - ), - request=RecraftImageGenerationRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/recraft/image_generation", method="POST"), + response_model=RecraftImageGenerationResponse, + data=RecraftImageGenerationRequest( prompt=prompt, negative_prompt=negative_prompt, model=RecraftModel.recraftv3, @@ -451,109 +420,83 @@ class RecraftTextToImageNode: style_id=recraft_style.style_id, controls=controls_api, ), - auth_kwargs=kwargs, + max_retries=1, ) - response: RecraftImageGenerationResponse = await operation.execute() images = [] - urls = [] for data in response.data: with handle_recraft_image_output(): - if unique_id and data.url: - urls.append(data.url) - urls_string = '\n'.join(urls) - PromptServer.instance.send_progress_text( - f"Result URL: {urls_string}", unique_id - ) - image = bytesio_to_image_tensor( - await download_url_to_bytesio(data.url, timeout=1024) - ) + image = bytesio_to_image_tensor(await download_url_as_bytesio(data.url, timeout=1024)) if len(image.shape) < 4: image = image.unsqueeze(0) images.append(image) - output_image = torch.cat(images, dim=0) - return (output_image,) + return IO.NodeOutput(torch.cat(images, dim=0)) -class RecraftImageToImageNode: - """ - Modify image based on prompt and strength. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftImageToImageNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftImageToImageNode", + display_name="Recraft Image to Image", + category="api node/image/Recraft", + description="Modify image based on prompt and strength.", + inputs=[ + IO.Image.Input("image"), + IO.String.Input("prompt", multiline=True, default="", tooltip="Prompt for the image generation."), + IO.Int.Input( + "n", + default=1, + min=1, + max=6, + tooltip="The number of images to generate.", + ), + IO.Float.Input( + "strength", + default=0.5, + min=0.0, + max=1.0, + step=0.01, + tooltip="Defines the difference with the original image, should lie in [0, 1], " + "where 0 means almost identical, and 1 means miserable similarity.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.Custom(RecraftIO.STYLEV3).Input("recraft_style", optional=True), + IO.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + IO.Custom(RecraftIO.CONTROLS).Input( + "recraft_controls", + tooltip="Optional additional controls over the generation via the Recraft Controls node.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE, ), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation.", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 6, - "tooltip": "The number of images to generate.", - }, - ), - "strength": ( - IO.FLOAT, - { - "default": 0.5, - "min": 0.0, - "max": 1.0, - "step": 0.01, - "tooltip": "Defines the difference with the original image, should lie in [0, 1], where 0 means almost identical, and 1 means miserable similarity." - } - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "recraft_style": (RecraftIO.STYLEV3,), - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - "recraft_controls": ( - RecraftIO.CONTROLS, - { - "tooltip": "Optional additional controls over the generation via the Recraft Controls node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call( - self, + async def execute( + cls, image: torch.Tensor, prompt: str, n: int, @@ -562,8 +505,7 @@ class RecraftImageToImageNode: recraft_style: RecraftStyle = None, negative_prompt: str = None, recraft_controls: RecraftControls = None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False, max_length=1000) default_style = RecraftStyle(RecraftStyleV3.realistic_image) if recraft_style is None: @@ -593,83 +535,69 @@ class RecraftImageToImageNode: pbar = ProgressBar(total) for i in range(total): sub_bytes = await handle_recraft_file_request( + cls, image=image[i], path="/proxy/recraft/images/imageToImage", request=request, - auth_kwargs=kwargs, ) with handle_recraft_image_output(): images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0)) pbar.update(1) - images_tensor = torch.cat(images, dim=0) - return (images_tensor, ) + return IO.NodeOutput(torch.cat(images, dim=0)) -class RecraftImageInpaintingNode: - """ - Modify image based on prompt and mask. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftImageInpaintingNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftImageInpaintingNode", + display_name="Recraft Image Inpainting", + category="api node/image/Recraft", + description="Modify image based on prompt and mask.", + inputs=[ + IO.Image.Input("image"), + IO.Mask.Input("mask"), + IO.String.Input("prompt", multiline=True, default="", tooltip="Prompt for the image generation."), + IO.Int.Input( + "n", + default=1, + min=1, + max=6, + tooltip="The number of images to generate.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.Custom(RecraftIO.STYLEV3).Input("recraft_style", optional=True), + IO.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE, ), - "mask": (IO.MASK, ), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation.", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 6, - "tooltip": "The number of images to generate.", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "recraft_style": (RecraftIO.STYLEV3,), - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call( - self, + async def execute( + cls, image: torch.Tensor, mask: torch.Tensor, prompt: str, @@ -677,8 +605,7 @@ class RecraftImageInpaintingNode: seed, recraft_style: RecraftStyle = None, negative_prompt: str = None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False, max_length=1000) default_style = RecraftStyle(RecraftStyleV3.realistic_image) if recraft_style is None: @@ -705,96 +632,73 @@ class RecraftImageInpaintingNode: pbar = ProgressBar(total) for i in range(total): sub_bytes = await handle_recraft_file_request( + cls, image=image[i], - mask=mask[i:i+1], + mask=mask[i : i + 1], path="/proxy/recraft/images/inpaint", request=request, - auth_kwargs=kwargs, ) with handle_recraft_image_output(): images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0)) pbar.update(1) - images_tensor = torch.cat(images, dim=0) - return (images_tensor, ) + return IO.NodeOutput(torch.cat(images, dim=0)) -class RecraftTextToVectorNode: - """ - Generates SVG synchronously based on prompt and resolution. - """ - - RETURN_TYPES = ("SVG",) # Changed - DESCRIPTION = cleandoc(__doc__ or "") if 'cleandoc' in globals() else __doc__ # Keep cleandoc if other nodes use it - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftTextToVectorNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftTextToVectorNode", + display_name="Recraft Text to Vector", + category="api node/image/Recraft", + description="Generates SVG synchronously based on prompt and resolution.", + inputs=[ + IO.String.Input("prompt", default="", tooltip="Prompt for the image generation.", multiline=True), + IO.Combo.Input("substyle", options=get_v3_substyles(RecraftStyleV3.vector_illustration)), + IO.Combo.Input( + "size", + options=[res.value for res in RecraftImageSize], + default=RecraftImageSize.res_1024x1024, + tooltip="The size of the generated image.", + ), + IO.Int.Input("n", default=1, min=1, max=6, tooltip="The number of images to generate."), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + IO.Custom(RecraftIO.CONTROLS).Input( + "recraft_controls", + tooltip="Optional additional controls over the generation via the Recraft Controls node.", + optional=True, + ), + ], + outputs=[ + IO.SVG.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation.", - }, - ), - "substyle": (get_v3_substyles(RecraftStyleV3.vector_illustration),), - "size": ( - [res.value for res in RecraftImageSize], - { - "default": RecraftImageSize.res_1024x1024, - "tooltip": "The size of the generated image.", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 6, - "tooltip": "The number of images to generate.", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - "recraft_controls": ( - RecraftIO.CONTROLS, - { - "tooltip": "Optional additional controls over the generation via the Recraft Controls node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, substyle: str, size: str, @@ -802,9 +706,7 @@ class RecraftTextToVectorNode: seed, negative_prompt: str = None, recraft_controls: RecraftControls = None, - unique_id: Optional[str] = None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False, max_length=1000) # create RecraftStyle so strings will be formatted properly (i.e. "None" will become None) recraft_style = RecraftStyle(RecraftStyleV3.vector_illustration, substyle=substyle) @@ -816,14 +718,11 @@ class RecraftTextToVectorNode: if not negative_prompt: negative_prompt = None - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/recraft/image_generation", - method=HttpMethod.POST, - request_model=RecraftImageGenerationRequest, - response_model=RecraftImageGenerationResponse, - ), - request=RecraftImageGenerationRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/recraft/image_generation", method="POST"), + response_model=RecraftImageGenerationResponse, + data=RecraftImageGenerationRequest( prompt=prompt, negative_prompt=negative_prompt, model=RecraftModel.recraftv3, @@ -833,139 +732,105 @@ class RecraftTextToVectorNode: substyle=recraft_style.substyle, controls=controls_api, ), - auth_kwargs=kwargs, + max_retries=1, ) - response: RecraftImageGenerationResponse = await operation.execute() svg_data = [] - urls = [] for data in response.data: - if unique_id and data.url: - urls.append(data.url) - # Print result on each iteration in case of error - PromptServer.instance.send_progress_text( - f"Result URL: {' '.join(urls)}", unique_id - ) - svg_data.append(await download_url_to_bytesio(data.url, timeout=1024)) + svg_data.append(await download_url_as_bytesio(data.url, timeout=1024)) - return (SVG(svg_data),) + return IO.NodeOutput(SVG(svg_data)) -class RecraftVectorizeImageNode: - """ - Generates SVG synchronously from an input image. - """ - - RETURN_TYPES = ("SVG",) # Changed - DESCRIPTION = cleandoc(__doc__ or "") if 'cleandoc' in globals() else __doc__ # Keep cleandoc if other nodes use it - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftVectorizeImageNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftVectorizeImageNode", + display_name="Recraft Vectorize Image", + category="api node/image/Recraft", + description="Generates SVG synchronously from an input image.", + inputs=[ + IO.Image.Input("image"), + ], + outputs=[ + IO.SVG.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE, ), - }, - "optional": { - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call( - self, - image: torch.Tensor, - **kwargs, - ): + async def execute(cls, image: torch.Tensor) -> IO.NodeOutput: svgs = [] total = image.shape[0] pbar = ProgressBar(total) for i in range(total): sub_bytes = await handle_recraft_file_request( + cls, image=image[i], path="/proxy/recraft/images/vectorize", - auth_kwargs=kwargs, ) svgs.append(SVG(sub_bytes)) pbar.update(1) - return (SVG.combine_all(svgs), ) + return IO.NodeOutput(SVG.combine_all(svgs)) -class RecraftReplaceBackgroundNode: - """ - Replace background on image, based on provided prompt. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftReplaceBackgroundNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftReplaceBackgroundNode", + display_name="Recraft Replace Background", + category="api node/image/Recraft", + description="Replace background on image, based on provided prompt.", + inputs=[ + IO.Image.Input("image"), + IO.String.Input("prompt", tooltip="Prompt for the image generation.", default="", multiline=True), + IO.Int.Input("n", default=1, min=1, max=6, tooltip="The number of images to generate."), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.Custom(RecraftIO.STYLEV3).Input("recraft_style", optional=True), + IO.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE, ), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation.", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 6, - "tooltip": "The number of images to generate.", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "recraft_style": (RecraftIO.STYLEV3,), - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call( - self, + async def execute( + cls, image: torch.Tensor, prompt: str, n: int, seed, recraft_style: RecraftStyle = None, negative_prompt: str = None, - **kwargs, - ): + ) -> IO.NodeOutput: default_style = RecraftStyle(RecraftStyleV3.realistic_image) if recraft_style is None: recraft_style = default_style @@ -988,165 +853,151 @@ class RecraftReplaceBackgroundNode: pbar = ProgressBar(total) for i in range(total): sub_bytes = await handle_recraft_file_request( + cls, image=image[i], path="/proxy/recraft/images/replaceBackground", request=request, - auth_kwargs=kwargs, ) images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0)) pbar.update(1) - images_tensor = torch.cat(images, dim=0) - return (images_tensor, ) + return IO.NodeOutput(torch.cat(images, dim=0)) -class RecraftRemoveBackgroundNode: - """ - Remove background from image, and return processed image and mask. - """ - - RETURN_TYPES = (IO.IMAGE, IO.MASK) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftRemoveBackgroundNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftRemoveBackgroundNode", + display_name="Recraft Remove Background", + category="api node/image/Recraft", + description="Remove background from image, and return processed image and mask.", + inputs=[ + IO.Image.Input("image"), + ], + outputs=[ + IO.Image.Output(), + IO.Mask.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE, ), - }, - "optional": { - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call( - self, - image: torch.Tensor, - **kwargs, - ): + async def execute(cls, image: torch.Tensor) -> IO.NodeOutput: images = [] total = image.shape[0] pbar = ProgressBar(total) for i in range(total): sub_bytes = await handle_recraft_file_request( + cls, image=image[i], path="/proxy/recraft/images/removeBackground", - auth_kwargs=kwargs, ) images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0)) pbar.update(1) images_tensor = torch.cat(images, dim=0) # use alpha channel as masks, in B,H,W format - masks_tensor = images_tensor[:,:,:,-1:].squeeze(-1) - return (images_tensor, masks_tensor) + masks_tensor = images_tensor[:, :, :, -1:].squeeze(-1) + return IO.NodeOutput(images_tensor, masks_tensor) -class RecraftCrispUpscaleNode: - """ - Upscale image synchronously. - Enhances a given raster image using ‘crisp upscale’ tool, increasing image resolution, making the image sharper and cleaner. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" - +class RecraftCrispUpscaleNode(IO.ComfyNode): RECRAFT_PATH = "/proxy/recraft/images/crispUpscale" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE, ), - }, - "optional": { - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="RecraftCrispUpscaleNode", + display_name="Recraft Crisp Upscale Image", + category="api node/image/Recraft", + description="Upscale image synchronously.\n" + "Enhances a given raster image using ‘crisp upscale’ tool, " + "increasing image resolution, making the image sharper and cleaner.", + inputs=[ + IO.Image.Input("image"), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - async def api_call( - self, - image: torch.Tensor, - **kwargs, - ): + @classmethod + async def execute(cls, image: torch.Tensor) -> IO.NodeOutput: images = [] total = image.shape[0] pbar = ProgressBar(total) for i in range(total): sub_bytes = await handle_recraft_file_request( + cls, image=image[i], - path=self.RECRAFT_PATH, - auth_kwargs=kwargs, + path=cls.RECRAFT_PATH, ) images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0)) pbar.update(1) - images_tensor = torch.cat(images, dim=0) - return (images_tensor,) + return IO.NodeOutput(torch.cat(images, dim=0)) class RecraftCreativeUpscaleNode(RecraftCrispUpscaleNode): - """ - Upscale image synchronously. - Enhances a given raster image using ‘creative upscale’ tool, boosting resolution with a focus on refining small details and faces. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" - RECRAFT_PATH = "/proxy/recraft/images/creativeUpscale" + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftCreativeUpscaleNode", + display_name="Recraft Creative Upscale Image", + category="api node/image/Recraft", + description="Upscale image synchronously.\n" + "Enhances a given raster image using ‘creative upscale’ tool, " + "boosting resolution with a focus on refining small details and faces.", + inputs=[ + IO.Image.Input("image"), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "RecraftTextToImageNode": RecraftTextToImageNode, - "RecraftImageToImageNode": RecraftImageToImageNode, - "RecraftImageInpaintingNode": RecraftImageInpaintingNode, - "RecraftTextToVectorNode": RecraftTextToVectorNode, - "RecraftVectorizeImageNode": RecraftVectorizeImageNode, - "RecraftRemoveBackgroundNode": RecraftRemoveBackgroundNode, - "RecraftReplaceBackgroundNode": RecraftReplaceBackgroundNode, - "RecraftCrispUpscaleNode": RecraftCrispUpscaleNode, - "RecraftCreativeUpscaleNode": RecraftCreativeUpscaleNode, - "RecraftStyleV3RealisticImage": RecraftStyleV3RealisticImageNode, - "RecraftStyleV3DigitalIllustration": RecraftStyleV3DigitalIllustrationNode, - "RecraftStyleV3LogoRaster": RecraftStyleV3LogoRasterNode, - "RecraftStyleV3InfiniteStyleLibrary": RecraftStyleInfiniteStyleLibrary, - "RecraftColorRGB": RecraftColorRGBNode, - "RecraftControls": RecraftControlsNode, -} -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "RecraftTextToImageNode": "Recraft Text to Image", - "RecraftImageToImageNode": "Recraft Image to Image", - "RecraftImageInpaintingNode": "Recraft Image Inpainting", - "RecraftTextToVectorNode": "Recraft Text to Vector", - "RecraftVectorizeImageNode": "Recraft Vectorize Image", - "RecraftRemoveBackgroundNode": "Recraft Remove Background", - "RecraftReplaceBackgroundNode": "Recraft Replace Background", - "RecraftCrispUpscaleNode": "Recraft Crisp Upscale Image", - "RecraftCreativeUpscaleNode": "Recraft Creative Upscale Image", - "RecraftStyleV3RealisticImage": "Recraft Style - Realistic Image", - "RecraftStyleV3DigitalIllustration": "Recraft Style - Digital Illustration", - "RecraftStyleV3LogoRaster": "Recraft Style - Logo Raster", - "RecraftStyleV3InfiniteStyleLibrary": "Recraft Style - Infinite Style Library", - "RecraftColorRGB": "Recraft Color RGB", - "RecraftControls": "Recraft Controls", -} +class RecraftExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + RecraftTextToImageNode, + RecraftImageToImageNode, + RecraftImageInpaintingNode, + RecraftTextToVectorNode, + RecraftVectorizeImageNode, + RecraftRemoveBackgroundNode, + RecraftReplaceBackgroundNode, + RecraftCrispUpscaleNode, + RecraftCreativeUpscaleNode, + RecraftStyleV3RealisticImageNode, + RecraftStyleV3DigitalIllustrationNode, + RecraftStyleV3LogoRasterNode, + RecraftStyleInfiniteStyleLibrary, + RecraftColorRGBNode, + RecraftControlsNode, + ] + + +async def comfy_entrypoint() -> RecraftExtension: + return RecraftExtension() From 3fa7a5c04ae69ad168a875e8d3d453783d60899d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 28 Oct 2025 21:21:01 -0700 Subject: [PATCH 505/759] Speed up offloading using pinned memory. (#10526) To enable this feature use: --fast pinned_memory --- comfy/cli_args.py | 1 + comfy/model_management.py | 30 ++++++++++++++++++++++++++++++ comfy/model_patcher.py | 26 +++++++++++++++++++++++++- 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index cc1f12482..001abd843 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -144,6 +144,7 @@ class PerformanceFeature(enum.Enum): Fp8MatrixMultiplication = "fp8_matrix_mult" CublasOps = "cublas_ops" AutoTune = "autotune" + PinnedMem = "pinned_memory" parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature)))) diff --git a/comfy/model_management.py b/comfy/model_management.py index afe78f36e..3e5b977d4 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1080,6 +1080,36 @@ def cast_to_device(tensor, device, dtype, copy=False): non_blocking = device_supports_non_blocking(device) return cast_to(tensor, dtype=dtype, device=device, non_blocking=non_blocking, copy=copy) +def pin_memory(tensor): + if PerformanceFeature.PinnedMem not in args.fast: + return False + + if not is_nvidia(): + return False + + if not is_device_cpu(tensor.device): + return False + + if torch.cuda.cudart().cudaHostRegister(tensor.data_ptr(), tensor.numel() * tensor.element_size(), 1) == 0: + return True + + return False + +def unpin_memory(tensor): + if PerformanceFeature.PinnedMem not in args.fast: + return False + + if not is_nvidia(): + return False + + if not is_device_cpu(tensor.device): + return False + + if torch.cuda.cudart().cudaHostUnregister(tensor.data_ptr()) == 0: + return True + + return False + def sage_attention_enabled(): return args.use_sage_attention diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index c0b68fb8c..aec73349c 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -238,6 +238,7 @@ class ModelPatcher: self.force_cast_weights = False self.patches_uuid = uuid.uuid4() self.parent = None + self.pinned = set() self.attachments: dict[str] = {} self.additional_models: dict[str, list[ModelPatcher]] = {} @@ -618,6 +619,21 @@ class ModelPatcher: else: set_func(out_weight, inplace_update=inplace_update, seed=string_to_seed(key)) + def pin_weight_to_device(self, key): + weight, set_func, convert_func = get_key_weight(self.model, key) + if comfy.model_management.pin_memory(weight): + self.pinned.add(key) + + def unpin_weight(self, key): + if key in self.pinned: + weight, set_func, convert_func = get_key_weight(self.model, key) + comfy.model_management.unpin_memory(weight) + self.pinned.remove(key) + + def unpin_all_weights(self): + for key in list(self.pinned): + self.unpin_weight(key) + def _load_list(self): loading = [] for n, m in self.model.named_modules(): @@ -683,6 +699,8 @@ class ModelPatcher: patch_counter += 1 cast_weight = True + for param in params: + self.pin_weight_to_device("{}.{}".format(n, param)) else: if hasattr(m, "comfy_cast_weights"): wipe_lowvram_weight(m) @@ -713,7 +731,9 @@ class ModelPatcher: continue for param in params: - self.patch_weight_to_device("{}.{}".format(n, param), device_to=device_to) + key = "{}.{}".format(n, param) + self.unpin_weight(key) + self.patch_weight_to_device(key, device_to=device_to) logging.debug("lowvram: loaded module regularly {} {}".format(n, m)) m.comfy_patched_weights = True @@ -762,6 +782,7 @@ class ModelPatcher: self.eject_model() if unpatch_weights: self.unpatch_hooks() + self.unpin_all_weights() if self.model.model_lowvram: for m in self.model.modules(): move_weight_functions(m, device_to) @@ -857,6 +878,9 @@ class ModelPatcher: memory_freed += module_mem logging.debug("freed {}".format(n)) + for param in params: + self.pin_weight_to_device("{}.{}".format(n, param)) + self.model.model_lowvram = True self.model.lowvram_patch_counter += patch_counter self.model.model_loaded_weight_memory -= memory_freed From e525673f7201b6c49af0fa0e6baf44e4e98bb10c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 28 Oct 2025 21:37:00 -0700 Subject: [PATCH 506/759] Fix issue. (#10527) --- comfy/sd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/sd.py b/comfy/sd.py index 6411bb27d..de4eee96e 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -1330,7 +1330,7 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None): else: unet_dtype = dtype - if hasattr(model_config, "layer_quant_config"): + if model_config.layer_quant_config is not None: manual_cast_dtype = model_management.unet_manual_cast(None, load_device, model_config.supported_inference_dtypes) else: manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes) From 6c14f3afac0ea28dba24fe8783e7c1f09c03b31f Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 29 Oct 2025 20:14:56 +0200 Subject: [PATCH 507/759] use new API client in Luma and Minimax nodes (#10528) --- comfy_api_nodes/apinode_utils.py | 81 ------ comfy_api_nodes/apis/minimax_api.py | 120 ++++++++ comfy_api_nodes/nodes_ideogram.py | 2 +- comfy_api_nodes/nodes_luma.py | 354 ++++++----------------- comfy_api_nodes/nodes_minimax.py | 237 +++++---------- comfy_api_nodes/util/client.py | 2 +- comfy_api_nodes/util/download_helpers.py | 3 +- 7 files changed, 283 insertions(+), 516 deletions(-) create mode 100644 comfy_api_nodes/apis/minimax_api.py diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py index 4182c8f80..6a72b9d1d 100644 --- a/comfy_api_nodes/apinode_utils.py +++ b/comfy_api_nodes/apinode_utils.py @@ -3,14 +3,6 @@ import aiohttp import mimetypes from typing import Optional, Union from comfy.utils import common_upscale -from comfy_api_nodes.apis.client import ( - ApiClient, - ApiEndpoint, - HttpMethod, - SynchronousOperation, - UploadRequest, - UploadResponse, -) from server import PromptServer from comfy.cli_args import args @@ -19,7 +11,6 @@ from PIL import Image import torch import math import base64 -from .util import tensor_to_bytesio, bytesio_to_image_tensor from io import BytesIO @@ -148,11 +139,6 @@ async def download_url_to_bytesio( return BytesIO(await resp.read()) -def process_image_response(response_content: bytes | str) -> torch.Tensor: - """Uses content from a Response object and converts it to a torch.Tensor""" - return bytesio_to_image_tensor(BytesIO(response_content)) - - def text_filepath_to_base64_string(filepath: str) -> str: """Converts a text file to a base64 string.""" with open(filepath, "rb") as f: @@ -169,73 +155,6 @@ def text_filepath_to_data_uri(filepath: str) -> str: return f"data:{mime_type};base64,{base64_string}" -async def upload_file_to_comfyapi( - file_bytes_io: BytesIO, - filename: str, - upload_mime_type: Optional[str], - auth_kwargs: Optional[dict[str, str]] = None, -) -> str: - """ - Uploads a single file to ComfyUI API and returns its download URL. - - Args: - file_bytes_io: BytesIO object containing the file data. - filename: The filename of the file. - upload_mime_type: MIME type of the file. - auth_kwargs: Optional authentication token(s). - - Returns: - The download URL for the uploaded file. - """ - if upload_mime_type is None: - request_object = UploadRequest(file_name=filename) - else: - request_object = UploadRequest(file_name=filename, content_type=upload_mime_type) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/customers/storage", - method=HttpMethod.POST, - request_model=UploadRequest, - response_model=UploadResponse, - ), - request=request_object, - auth_kwargs=auth_kwargs, - ) - - response: UploadResponse = await operation.execute() - await ApiClient.upload_file(response.upload_url, file_bytes_io, content_type=upload_mime_type) - return response.download_url - - -async def upload_images_to_comfyapi( - image: torch.Tensor, - max_images=8, - auth_kwargs: Optional[dict[str, str]] = None, - mime_type: Optional[str] = None, -) -> list[str]: - """ - Uploads images to ComfyUI API and returns download URLs. - To upload multiple images, stack them in the batch dimension first. - - Args: - image: Input torch.Tensor image. - max_images: Maximum number of images to upload. - auth_kwargs: Optional authentication token(s). - mime_type: Optional MIME type for the image. - """ - # if batch, try to upload each file if max_images is greater than 0 - download_urls: list[str] = [] - is_batch = len(image.shape) > 3 - batch_len = image.shape[0] if is_batch else 1 - - for idx in range(min(batch_len, max_images)): - tensor = image[idx] if is_batch else image - img_io = tensor_to_bytesio(tensor, mime_type=mime_type) - url = await upload_file_to_comfyapi(img_io, img_io.name, mime_type, auth_kwargs) - download_urls.append(url) - return download_urls - - def resize_mask_to_image( mask: torch.Tensor, image: torch.Tensor, diff --git a/comfy_api_nodes/apis/minimax_api.py b/comfy_api_nodes/apis/minimax_api.py new file mode 100644 index 000000000..d747e177a --- /dev/null +++ b/comfy_api_nodes/apis/minimax_api.py @@ -0,0 +1,120 @@ +from enum import Enum +from typing import Optional + +from pydantic import BaseModel, Field + + +class MinimaxBaseResponse(BaseModel): + status_code: int = Field( + ..., + description='Status code. 0 indicates success, other values indicate errors.', + ) + status_msg: str = Field( + ..., description='Specific error details or success message.' + ) + + +class File(BaseModel): + bytes: Optional[int] = Field(None, description='File size in bytes') + created_at: Optional[int] = Field( + None, description='Unix timestamp when the file was created, in seconds' + ) + download_url: Optional[str] = Field( + None, description='The URL to download the video' + ) + backup_download_url: Optional[str] = Field( + None, description='The backup URL to download the video' + ) + + file_id: Optional[int] = Field(None, description='Unique identifier for the file') + filename: Optional[str] = Field(None, description='The name of the file') + purpose: Optional[str] = Field(None, description='The purpose of using the file') + + +class MinimaxFileRetrieveResponse(BaseModel): + base_resp: MinimaxBaseResponse + file: File + + +class MiniMaxModel(str, Enum): + T2V_01_Director = 'T2V-01-Director' + I2V_01_Director = 'I2V-01-Director' + S2V_01 = 'S2V-01' + I2V_01 = 'I2V-01' + I2V_01_live = 'I2V-01-live' + T2V_01 = 'T2V-01' + Hailuo_02 = 'MiniMax-Hailuo-02' + + +class Status6(str, Enum): + Queueing = 'Queueing' + Preparing = 'Preparing' + Processing = 'Processing' + Success = 'Success' + Fail = 'Fail' + + +class MinimaxTaskResultResponse(BaseModel): + base_resp: MinimaxBaseResponse + file_id: Optional[str] = Field( + None, + description='After the task status changes to Success, this field returns the file ID corresponding to the generated video.', + ) + status: Status6 = Field( + ..., + description="Task status: 'Queueing' (in queue), 'Preparing' (task is preparing), 'Processing' (generating), 'Success' (task completed successfully), or 'Fail' (task failed).", + ) + task_id: str = Field(..., description='The task ID being queried.') + + +class SubjectReferenceItem(BaseModel): + image: Optional[str] = Field( + None, description='URL or base64 encoding of the subject reference image.' + ) + mask: Optional[str] = Field( + None, + description='URL or base64 encoding of the mask for the subject reference image.', + ) + + +class MinimaxVideoGenerationRequest(BaseModel): + callback_url: Optional[str] = Field( + None, + description='Optional. URL to receive real-time status updates about the video generation task.', + ) + first_frame_image: Optional[str] = Field( + None, + description='URL or base64 encoding of the first frame image. Required when model is I2V-01, I2V-01-Director, or I2V-01-live.', + ) + model: MiniMaxModel = Field( + ..., + description='Required. ID of model. Options: T2V-01-Director, I2V-01-Director, S2V-01, I2V-01, I2V-01-live, T2V-01', + ) + prompt: Optional[str] = Field( + None, + description='Description of the video. Should be less than 2000 characters. Supports camera movement instructions in [brackets].', + max_length=2000, + ) + prompt_optimizer: Optional[bool] = Field( + True, + description='If true (default), the model will automatically optimize the prompt. Set to false for more precise control.', + ) + subject_reference: Optional[list[SubjectReferenceItem]] = Field( + None, + description='Only available when model is S2V-01. The model will generate a video based on the subject uploaded through this parameter.', + ) + duration: Optional[int] = Field( + None, + description="The length of the output video in seconds." + ) + resolution: Optional[str] = Field( + None, + description="The dimensions of the video display. 1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels." + ) + + +class MinimaxVideoGenerationResponse(BaseModel): + base_resp: MinimaxBaseResponse + task_id: str = Field( + ..., description='The task ID for the asynchronous video generation task.' + ) diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index 9eae5f11a..d8fd3378b 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -20,9 +20,9 @@ from comfy_api_nodes.apis.client import ( from comfy_api_nodes.apinode_utils import ( download_url_to_bytesio, - bytesio_to_image_tensor, resize_mask_to_image, ) +from comfy_api_nodes.util import bytesio_to_image_tensor from server import PromptServer V1_V1_RES_MAP = { diff --git a/comfy_api_nodes/nodes_luma.py b/comfy_api_nodes/nodes_luma.py index e74441e5e..894f2b08c 100644 --- a/comfy_api_nodes/nodes_luma.py +++ b/comfy_api_nodes/nodes_luma.py @@ -1,69 +1,51 @@ -from __future__ import annotations -from inspect import cleandoc from typing import Optional + +import torch from typing_extensions import override -from comfy_api.latest import ComfyExtension, IO -from comfy_api.input_impl.video_types import VideoFromFile + +from comfy_api.latest import IO, ComfyExtension from comfy_api_nodes.apis.luma_api import ( - LumaImageModel, - LumaVideoModel, - LumaVideoOutputResolution, - LumaVideoModelOutputDuration, LumaAspectRatio, - LumaState, - LumaImageGenerationRequest, - LumaGenerationRequest, - LumaGeneration, LumaCharacterRef, - LumaModifyImageRef, + LumaConceptChain, + LumaGeneration, + LumaGenerationRequest, + LumaImageGenerationRequest, LumaImageIdentity, + LumaImageModel, + LumaImageReference, + LumaIO, + LumaKeyframes, + LumaModifyImageRef, LumaReference, LumaReferenceChain, - LumaImageReference, - LumaKeyframes, - LumaConceptChain, - LumaIO, + LumaVideoModel, + LumaVideoModelOutputDuration, + LumaVideoOutputResolution, get_luma_concepts, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( + download_url_to_image_tensor, + download_url_to_video_output, + poll_op, + sync_op, upload_images_to_comfyapi, - process_image_response, + validate_string, ) -from server import PromptServer -from comfy_api_nodes.util import validate_string - -import aiohttp -import torch -from io import BytesIO LUMA_T2V_AVERAGE_DURATION = 105 LUMA_I2V_AVERAGE_DURATION = 100 -def image_result_url_extractor(response: LumaGeneration): - return response.assets.image if hasattr(response, "assets") and hasattr(response.assets, "image") else None - -def video_result_url_extractor(response: LumaGeneration): - return response.assets.video if hasattr(response, "assets") and hasattr(response.assets, "video") else None class LumaReferenceNode(IO.ComfyNode): - """ - Holds an image and weight for use with Luma Generate Image node. - """ - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="LumaReferenceNode", display_name="Luma Reference", category="api node/image/Luma", - description=cleandoc(cls.__doc__ or ""), + description="Holds an image and weight for use with Luma Generate Image node.", inputs=[ IO.Image.Input( "image", @@ -83,17 +65,10 @@ class LumaReferenceNode(IO.ComfyNode): ), ], outputs=[IO.Custom(LumaIO.LUMA_REF).Output(display_name="luma_ref")], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], ) @classmethod - def execute( - cls, image: torch.Tensor, weight: float, luma_ref: LumaReferenceChain = None - ) -> IO.NodeOutput: + def execute(cls, image: torch.Tensor, weight: float, luma_ref: LumaReferenceChain = None) -> IO.NodeOutput: if luma_ref is not None: luma_ref = luma_ref.clone() else: @@ -103,17 +78,13 @@ class LumaReferenceNode(IO.ComfyNode): class LumaConceptsNode(IO.ComfyNode): - """ - Holds one or more Camera Concepts for use with Luma Text to Video and Luma Image to Video nodes. - """ - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="LumaConceptsNode", display_name="Luma Concepts", category="api node/video/Luma", - description=cleandoc(cls.__doc__ or ""), + description="Camera Concepts for use with Luma Text to Video and Luma Image to Video nodes.", inputs=[ IO.Combo.Input( "concept1", @@ -138,11 +109,6 @@ class LumaConceptsNode(IO.ComfyNode): ), ], outputs=[IO.Custom(LumaIO.LUMA_CONCEPTS).Output(display_name="luma_concepts")], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], ) @classmethod @@ -161,17 +127,13 @@ class LumaConceptsNode(IO.ComfyNode): class LumaImageGenerationNode(IO.ComfyNode): - """ - Generates images synchronously based on prompt and aspect ratio. - """ - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="LumaImageNode", display_name="Luma Text to Image", category="api node/image/Luma", - description=cleandoc(cls.__doc__ or ""), + description="Generates images synchronously based on prompt and aspect ratio.", inputs=[ IO.String.Input( "prompt", @@ -237,45 +199,30 @@ class LumaImageGenerationNode(IO.ComfyNode): aspect_ratio: str, seed, style_image_weight: float, - image_luma_ref: LumaReferenceChain = None, - style_image: torch.Tensor = None, - character_image: torch.Tensor = None, + image_luma_ref: Optional[LumaReferenceChain] = None, + style_image: Optional[torch.Tensor] = None, + character_image: Optional[torch.Tensor] = None, ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=3) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } # handle image_luma_ref api_image_ref = None if image_luma_ref is not None: - api_image_ref = await cls._convert_luma_refs( - image_luma_ref, max_refs=4, auth_kwargs=auth_kwargs, - ) + api_image_ref = await cls._convert_luma_refs(image_luma_ref, max_refs=4) # handle style_luma_ref api_style_ref = None if style_image is not None: - api_style_ref = await cls._convert_style_image( - style_image, weight=style_image_weight, auth_kwargs=auth_kwargs, - ) + api_style_ref = await cls._convert_style_image(style_image, weight=style_image_weight) # handle character_ref images character_ref = None if character_image is not None: - download_urls = await upload_images_to_comfyapi( - character_image, max_images=4, auth_kwargs=auth_kwargs, - ) - character_ref = LumaCharacterRef( - identity0=LumaImageIdentity(images=download_urls) - ) + download_urls = await upload_images_to_comfyapi(cls, character_image, max_images=4) + character_ref = LumaCharacterRef(identity0=LumaImageIdentity(images=download_urls)) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/luma/generations/image", - method=HttpMethod.POST, - request_model=LumaImageGenerationRequest, - response_model=LumaGeneration, - ), - request=LumaImageGenerationRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/luma/generations/image", method="POST"), + response_model=LumaGeneration, + data=LumaImageGenerationRequest( prompt=prompt, model=model, aspect_ratio=aspect_ratio, @@ -283,41 +230,21 @@ class LumaImageGenerationNode(IO.ComfyNode): style_ref=api_style_ref, character_ref=character_ref, ), - auth_kwargs=auth_kwargs, ) - response_api: LumaGeneration = await operation.execute() - - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/luma/generations/{response_api.id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=LumaGeneration, - ), - completed_statuses=[LumaState.completed], - failed_statuses=[LumaState.failed], + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/luma/generations/{response_api.id}"), + response_model=LumaGeneration, status_extractor=lambda x: x.state, - result_url_extractor=image_result_url_extractor, - node_id=cls.hidden.unique_id, - auth_kwargs=auth_kwargs, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.assets.image) as img_response: - img = process_image_response(await img_response.content.read()) - return IO.NodeOutput(img) + return IO.NodeOutput(await download_url_to_image_tensor(response_poll.assets.image)) @classmethod - async def _convert_luma_refs( - cls, luma_ref: LumaReferenceChain, max_refs: int, auth_kwargs: Optional[dict[str,str]] = None - ): + async def _convert_luma_refs(cls, luma_ref: LumaReferenceChain, max_refs: int): luma_urls = [] ref_count = 0 for ref in luma_ref.refs: - download_urls = await upload_images_to_comfyapi( - ref.image, max_images=1, auth_kwargs=auth_kwargs - ) + download_urls = await upload_images_to_comfyapi(cls, ref.image, max_images=1) luma_urls.append(download_urls[0]) ref_count += 1 if ref_count >= max_refs: @@ -325,27 +252,19 @@ class LumaImageGenerationNode(IO.ComfyNode): return luma_ref.create_api_model(download_urls=luma_urls, max_refs=max_refs) @classmethod - async def _convert_style_image( - cls, style_image: torch.Tensor, weight: float, auth_kwargs: Optional[dict[str,str]] = None - ): - chain = LumaReferenceChain( - first_ref=LumaReference(image=style_image, weight=weight) - ) - return await cls._convert_luma_refs(chain, max_refs=1, auth_kwargs=auth_kwargs) + async def _convert_style_image(cls, style_image: torch.Tensor, weight: float): + chain = LumaReferenceChain(first_ref=LumaReference(image=style_image, weight=weight)) + return await cls._convert_luma_refs(chain, max_refs=1) class LumaImageModifyNode(IO.ComfyNode): - """ - Modifies images synchronously based on prompt and aspect ratio. - """ - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="LumaImageModifyNode", display_name="Luma Image to Image", category="api node/image/Luma", - description=cleandoc(cls.__doc__ or ""), + description="Modifies images synchronously based on prompt and aspect ratio.", inputs=[ IO.Image.Input( "image", @@ -395,68 +314,37 @@ class LumaImageModifyNode(IO.ComfyNode): image_weight: float, seed, ) -> IO.NodeOutput: - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - # first, upload image - download_urls = await upload_images_to_comfyapi( - image, max_images=1, auth_kwargs=auth_kwargs, - ) + download_urls = await upload_images_to_comfyapi(cls, image, max_images=1) image_url = download_urls[0] - # next, make Luma call with download url provided - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/luma/generations/image", - method=HttpMethod.POST, - request_model=LumaImageGenerationRequest, - response_model=LumaGeneration, - ), - request=LumaImageGenerationRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/luma/generations/image", method="POST"), + response_model=LumaGeneration, + data=LumaImageGenerationRequest( prompt=prompt, model=model, modify_image_ref=LumaModifyImageRef( - url=image_url, weight=round(max(min(1.0-image_weight, 0.98), 0.0), 2) + url=image_url, weight=round(max(min(1.0 - image_weight, 0.98), 0.0), 2) ), ), - auth_kwargs=auth_kwargs, ) - response_api: LumaGeneration = await operation.execute() - - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/luma/generations/{response_api.id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=LumaGeneration, - ), - completed_statuses=[LumaState.completed], - failed_statuses=[LumaState.failed], + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/luma/generations/{response_api.id}"), + response_model=LumaGeneration, status_extractor=lambda x: x.state, - result_url_extractor=image_result_url_extractor, - node_id=cls.hidden.unique_id, - auth_kwargs=auth_kwargs, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.assets.image) as img_response: - img = process_image_response(await img_response.content.read()) - return IO.NodeOutput(img) + return IO.NodeOutput(await download_url_to_image_tensor(response_poll.assets.image)) class LumaTextToVideoGenerationNode(IO.ComfyNode): - """ - Generates videos synchronously based on prompt and output_size. - """ - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="LumaVideoNode", display_name="Luma Text to Video", category="api node/video/Luma", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos synchronously based on prompt and output_size.", inputs=[ IO.String.Input( "prompt", @@ -498,7 +386,7 @@ class LumaTextToVideoGenerationNode(IO.ComfyNode): "luma_concepts", tooltip="Optional Camera Concepts to dictate camera motion via the Luma Concepts node.", optional=True, - ) + ), ], outputs=[IO.Video.Output()], hidden=[ @@ -519,24 +407,17 @@ class LumaTextToVideoGenerationNode(IO.ComfyNode): duration: str, loop: bool, seed, - luma_concepts: LumaConceptChain = None, + luma_concepts: Optional[LumaConceptChain] = None, ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False, min_length=3) duration = duration if model != LumaVideoModel.ray_1_6 else None resolution = resolution if model != LumaVideoModel.ray_1_6 else None - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/luma/generations", - method=HttpMethod.POST, - request_model=LumaGenerationRequest, - response_model=LumaGeneration, - ), - request=LumaGenerationRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/luma/generations", method="POST"), + response_model=LumaGeneration, + data=LumaGenerationRequest( prompt=prompt, model=model, resolution=resolution, @@ -545,47 +426,25 @@ class LumaTextToVideoGenerationNode(IO.ComfyNode): loop=loop, concepts=luma_concepts.create_api_model() if luma_concepts else None, ), - auth_kwargs=auth_kwargs, ) - response_api: LumaGeneration = await operation.execute() - - if cls.hidden.unique_id: - PromptServer.instance.send_progress_text(f"Luma video generation started: {response_api.id}", cls.hidden.unique_id) - - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/luma/generations/{response_api.id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=LumaGeneration, - ), - completed_statuses=[LumaState.completed], - failed_statuses=[LumaState.failed], + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/luma/generations/{response_api.id}"), + response_model=LumaGeneration, status_extractor=lambda x: x.state, - result_url_extractor=video_result_url_extractor, - node_id=cls.hidden.unique_id, estimated_duration=LUMA_T2V_AVERAGE_DURATION, - auth_kwargs=auth_kwargs, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.assets.video) as vid_response: - return IO.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) + return IO.NodeOutput(await download_url_to_video_output(response_poll.assets.video)) class LumaImageToVideoGenerationNode(IO.ComfyNode): - """ - Generates videos synchronously based on prompt, input images, and output_size. - """ - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="LumaImageToVideoNode", display_name="Luma Image to Video", category="api node/video/Luma", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos synchronously based on prompt, input images, and output_size.", inputs=[ IO.String.Input( "prompt", @@ -637,7 +496,7 @@ class LumaImageToVideoGenerationNode(IO.ComfyNode): "luma_concepts", tooltip="Optional Camera Concepts to dictate camera motion via the Luma Concepts node.", optional=True, - ) + ), ], outputs=[IO.Video.Output()], hidden=[ @@ -662,25 +521,15 @@ class LumaImageToVideoGenerationNode(IO.ComfyNode): luma_concepts: LumaConceptChain = None, ) -> IO.NodeOutput: if first_image is None and last_image is None: - raise Exception( - "At least one of first_image and last_image requires an input." - ) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - keyframes = await cls._convert_to_keyframes(first_image, last_image, auth_kwargs=auth_kwargs) + raise Exception("At least one of first_image and last_image requires an input.") + keyframes = await cls._convert_to_keyframes(first_image, last_image) duration = duration if model != LumaVideoModel.ray_1_6 else None resolution = resolution if model != LumaVideoModel.ray_1_6 else None - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/luma/generations", - method=HttpMethod.POST, - request_model=LumaGenerationRequest, - response_model=LumaGeneration, - ), - request=LumaGenerationRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/luma/generations", method="POST"), + response_model=LumaGeneration, + data=LumaGenerationRequest( prompt=prompt, model=model, aspect_ratio=LumaAspectRatio.ratio_16_9, # ignored, but still needed by the API for some reason @@ -690,54 +539,31 @@ class LumaImageToVideoGenerationNode(IO.ComfyNode): keyframes=keyframes, concepts=luma_concepts.create_api_model() if luma_concepts else None, ), - auth_kwargs=auth_kwargs, ) - response_api: LumaGeneration = await operation.execute() - - if cls.hidden.unique_id: - PromptServer.instance.send_progress_text(f"Luma video generation started: {response_api.id}", cls.hidden.unique_id) - - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/luma/generations/{response_api.id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=LumaGeneration, - ), - completed_statuses=[LumaState.completed], - failed_statuses=[LumaState.failed], + response_poll = await poll_op( + cls, + poll_endpoint=ApiEndpoint(path=f"/proxy/luma/generations/{response_api.id}"), + response_model=LumaGeneration, status_extractor=lambda x: x.state, - result_url_extractor=video_result_url_extractor, - node_id=cls.hidden.unique_id, estimated_duration=LUMA_I2V_AVERAGE_DURATION, - auth_kwargs=auth_kwargs, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.assets.video) as vid_response: - return IO.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) + return IO.NodeOutput(await download_url_to_video_output(response_poll.assets.video)) @classmethod async def _convert_to_keyframes( cls, first_image: torch.Tensor = None, last_image: torch.Tensor = None, - auth_kwargs: Optional[dict[str,str]] = None, ): if first_image is None and last_image is None: return None frame0 = None frame1 = None if first_image is not None: - download_urls = await upload_images_to_comfyapi( - first_image, max_images=1, auth_kwargs=auth_kwargs, - ) + download_urls = await upload_images_to_comfyapi(cls, first_image, max_images=1) frame0 = LumaImageReference(type="image", url=download_urls[0]) if last_image is not None: - download_urls = await upload_images_to_comfyapi( - last_image, max_images=1, auth_kwargs=auth_kwargs, - ) + download_urls = await upload_images_to_comfyapi(cls, last_image, max_images=1) frame1 = LumaImageReference(type="image", url=download_urls[0]) return LumaKeyframes(frame0=frame0, frame1=frame1) diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index e3722e79b..05cbb700f 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -1,71 +1,57 @@ -from inspect import cleandoc from typing import Optional -import logging -import torch +import torch from typing_extensions import override -from comfy_api.latest import ComfyExtension, IO -from comfy_api.input_impl.video_types import VideoFromFile -from comfy_api_nodes.apis import ( + +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.apis.minimax_api import ( + MinimaxFileRetrieveResponse, + MiniMaxModel, + MinimaxTaskResultResponse, MinimaxVideoGenerationRequest, MinimaxVideoGenerationResponse, - MinimaxFileRetrieveResponse, - MinimaxTaskResultResponse, SubjectReferenceItem, - MiniMaxModel, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( - download_url_to_bytesio, + download_url_to_video_output, + poll_op, + sync_op, upload_images_to_comfyapi, + validate_string, ) -from comfy_api_nodes.util import validate_string -from server import PromptServer - I2V_AVERAGE_DURATION = 114 T2V_AVERAGE_DURATION = 234 async def _generate_mm_video( + cls: type[IO.ComfyNode], *, - auth: dict[str, str], - node_id: str, prompt_text: str, seed: int, model: str, - image: Optional[torch.Tensor] = None, # used for ImageToVideo - subject: Optional[torch.Tensor] = None, # used for SubjectToVideo + image: Optional[torch.Tensor] = None, # used for ImageToVideo + subject: Optional[torch.Tensor] = None, # used for SubjectToVideo average_duration: Optional[int] = None, ) -> IO.NodeOutput: if image is None: validate_string(prompt_text, field_name="prompt_text") - # upload image, if passed in image_url = None if image is not None: - image_url = (await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=auth))[0] + image_url = (await upload_images_to_comfyapi(cls, image, max_images=1))[0] # TODO: figure out how to deal with subject properly, API returns invalid params when using S2V-01 model subject_reference = None if subject is not None: - subject_url = (await upload_images_to_comfyapi(subject, max_images=1, auth_kwargs=auth))[0] + subject_url = (await upload_images_to_comfyapi(cls, subject, max_images=1))[0] subject_reference = [SubjectReferenceItem(image=subject_url)] - - video_generate_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/minimax/video_generation", - method=HttpMethod.POST, - request_model=MinimaxVideoGenerationRequest, - response_model=MinimaxVideoGenerationResponse, - ), - request=MinimaxVideoGenerationRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/minimax/video_generation", method="POST"), + response_model=MinimaxVideoGenerationResponse, + data=MinimaxVideoGenerationRequest( model=MiniMaxModel(model), prompt=prompt_text, callback_url=None, @@ -73,81 +59,50 @@ async def _generate_mm_video( subject_reference=subject_reference, prompt_optimizer=None, ), - auth_kwargs=auth, ) - response = await video_generate_operation.execute() task_id = response.task_id if not task_id: raise Exception(f"MiniMax generation failed: {response.base_resp}") - video_generate_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path="/proxy/minimax/query/video_generation", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MinimaxTaskResultResponse, - query_params={"task_id": task_id}, - ), - completed_statuses=["Success"], - failed_statuses=["Fail"], + task_result = await poll_op( + cls, + ApiEndpoint(path="/proxy/minimax/query/video_generation", query_params={"task_id": task_id}), + response_model=MinimaxTaskResultResponse, status_extractor=lambda x: x.status.value, estimated_duration=average_duration, - node_id=node_id, - auth_kwargs=auth, ) - task_result = await video_generate_operation.execute() file_id = task_result.file_id if file_id is None: raise Exception("Request was not successful. Missing file ID.") - file_retrieve_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/minimax/files/retrieve", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MinimaxFileRetrieveResponse, - query_params={"file_id": int(file_id)}, - ), - request=EmptyRequest(), - auth_kwargs=auth, + file_result = await sync_op( + cls, + ApiEndpoint(path="/proxy/minimax/files/retrieve", query_params={"file_id": int(file_id)}), + response_model=MinimaxFileRetrieveResponse, ) - file_result = await file_retrieve_operation.execute() file_url = file_result.file.download_url if file_url is None: - raise Exception( - f"No video was found in the response. Full response: {file_result.model_dump()}" - ) - logging.info("Generated video URL: %s", file_url) - if node_id: - if hasattr(file_result.file, "backup_download_url"): - message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}" - else: - message = f"Result URL: {file_url}" - PromptServer.instance.send_progress_text(message, node_id) - - # Download and return as VideoFromFile - video_io = await download_url_to_bytesio(file_url) - if video_io is None: - error_msg = f"Failed to download video from {file_url}" - logging.error(error_msg) - raise Exception(error_msg) - return IO.NodeOutput(VideoFromFile(video_io)) + raise Exception(f"No video was found in the response. Full response: {file_result.model_dump()}") + if file_result.file.backup_download_url: + try: + return IO.NodeOutput(await download_url_to_video_output(file_url, timeout=10, max_retries=2)) + except Exception: # if we have a second URL to retrieve the result, try again using that one + return IO.NodeOutput( + await download_url_to_video_output(file_result.file.backup_download_url, max_retries=3) + ) + return IO.NodeOutput(await download_url_to_video_output(file_url)) class MinimaxTextToVideoNode(IO.ComfyNode): - """ - Generates videos synchronously based on a prompt, and optional parameters using MiniMax's API. - """ - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="MinimaxTextToVideoNode", display_name="MiniMax Text to Video", category="api node/video/MiniMax", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos synchronously based on a prompt, and optional parameters.", inputs=[ IO.String.Input( "prompt_text", @@ -189,11 +144,7 @@ class MinimaxTextToVideoNode(IO.ComfyNode): seed: int = 0, ) -> IO.NodeOutput: return await _generate_mm_video( - auth={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, prompt_text=prompt_text, seed=seed, model=model, @@ -204,17 +155,13 @@ class MinimaxTextToVideoNode(IO.ComfyNode): class MinimaxImageToVideoNode(IO.ComfyNode): - """ - Generates videos synchronously based on an image and prompt, and optional parameters using MiniMax's API. - """ - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="MinimaxImageToVideoNode", display_name="MiniMax Image to Video", category="api node/video/MiniMax", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos synchronously based on an image and prompt, and optional parameters.", inputs=[ IO.Image.Input( "image", @@ -261,11 +208,7 @@ class MinimaxImageToVideoNode(IO.ComfyNode): seed: int = 0, ) -> IO.NodeOutput: return await _generate_mm_video( - auth={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, prompt_text=prompt_text, seed=seed, model=model, @@ -276,17 +219,13 @@ class MinimaxImageToVideoNode(IO.ComfyNode): class MinimaxSubjectToVideoNode(IO.ComfyNode): - """ - Generates videos synchronously based on an image and prompt, and optional parameters using MiniMax's API. - """ - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="MinimaxSubjectToVideoNode", display_name="MiniMax Subject to Video", category="api node/video/MiniMax", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos synchronously based on an image and prompt, and optional parameters.", inputs=[ IO.Image.Input( "subject", @@ -333,11 +272,7 @@ class MinimaxSubjectToVideoNode(IO.ComfyNode): seed: int = 0, ) -> IO.NodeOutput: return await _generate_mm_video( - auth={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, prompt_text=prompt_text, seed=seed, model=model, @@ -348,15 +283,13 @@ class MinimaxSubjectToVideoNode(IO.ComfyNode): class MinimaxHailuoVideoNode(IO.ComfyNode): - """Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.""" - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="MinimaxHailuoVideoNode", display_name="MiniMax Hailuo Video", category="api node/video/MiniMax", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.", inputs=[ IO.String.Input( "prompt_text", @@ -420,10 +353,6 @@ class MinimaxHailuoVideoNode(IO.ComfyNode): resolution: str = "768P", model: str = "MiniMax-Hailuo-02", ) -> IO.NodeOutput: - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } if first_frame_image is None: validate_string(prompt_text, field_name="prompt_text") @@ -435,16 +364,13 @@ class MinimaxHailuoVideoNode(IO.ComfyNode): # upload image, if passed in image_url = None if first_frame_image is not None: - image_url = (await upload_images_to_comfyapi(first_frame_image, max_images=1, auth_kwargs=auth))[0] + image_url = (await upload_images_to_comfyapi(cls, first_frame_image, max_images=1))[0] - video_generate_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/minimax/video_generation", - method=HttpMethod.POST, - request_model=MinimaxVideoGenerationRequest, - response_model=MinimaxVideoGenerationResponse, - ), - request=MinimaxVideoGenerationRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/minimax/video_generation", method="POST"), + response_model=MinimaxVideoGenerationResponse, + data=MinimaxVideoGenerationRequest( model=MiniMaxModel(model), prompt=prompt_text, callback_url=None, @@ -453,67 +379,42 @@ class MinimaxHailuoVideoNode(IO.ComfyNode): duration=duration, resolution=resolution, ), - auth_kwargs=auth, ) - response = await video_generate_operation.execute() task_id = response.task_id if not task_id: raise Exception(f"MiniMax generation failed: {response.base_resp}") average_duration = 120 if resolution == "768P" else 240 - video_generate_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path="/proxy/minimax/query/video_generation", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MinimaxTaskResultResponse, - query_params={"task_id": task_id}, - ), - completed_statuses=["Success"], - failed_statuses=["Fail"], + task_result = await poll_op( + cls, + ApiEndpoint(path="/proxy/minimax/query/video_generation", query_params={"task_id": task_id}), + response_model=MinimaxTaskResultResponse, status_extractor=lambda x: x.status.value, estimated_duration=average_duration, - node_id=cls.hidden.unique_id, - auth_kwargs=auth, ) - task_result = await video_generate_operation.execute() file_id = task_result.file_id if file_id is None: raise Exception("Request was not successful. Missing file ID.") - file_retrieve_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/minimax/files/retrieve", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MinimaxFileRetrieveResponse, - query_params={"file_id": int(file_id)}, - ), - request=EmptyRequest(), - auth_kwargs=auth, + file_result = await sync_op( + cls, + ApiEndpoint(path="/proxy/minimax/files/retrieve", query_params={"file_id": int(file_id)}), + response_model=MinimaxFileRetrieveResponse, ) - file_result = await file_retrieve_operation.execute() file_url = file_result.file.download_url if file_url is None: - raise Exception( - f"No video was found in the response. Full response: {file_result.model_dump()}" - ) - logging.info("Generated video URL: %s", file_url) - if cls.hidden.unique_id: - if hasattr(file_result.file, "backup_download_url"): - message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}" - else: - message = f"Result URL: {file_url}" - PromptServer.instance.send_progress_text(message, cls.hidden.unique_id) + raise Exception(f"No video was found in the response. Full response: {file_result.model_dump()}") - video_io = await download_url_to_bytesio(file_url) - if video_io is None: - error_msg = f"Failed to download video from {file_url}" - logging.error(error_msg) - raise Exception(error_msg) - return IO.NodeOutput(VideoFromFile(video_io)) + if file_result.file.backup_download_url: + try: + return IO.NodeOutput(await download_url_to_video_output(file_url, timeout=10, max_retries=2)) + except Exception: # if we have a second URL to retrieve the result, try again using that one + return IO.NodeOutput( + await download_url_to_video_output(file_result.file.backup_download_url, max_retries=3) + ) + return IO.NodeOutput(await download_url_to_video_output(file_url)) class MinimaxExtension(ComfyExtension): diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py index 9c036d64b..9ae512fe5 100644 --- a/comfy_api_nodes/util/client.py +++ b/comfy_api_nodes/util/client.py @@ -78,7 +78,7 @@ class _PollUIState: _RETRY_STATUS = {408, 429, 500, 502, 503, 504} COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed"] -FAILED_STATUSES = ["cancelled", "canceled", "failed", "error"] +FAILED_STATUSES = ["cancelled", "canceled", "fail", "failed", "error"] QUEUED_STATUSES = ["created", "queued", "queueing", "submitted"] diff --git a/comfy_api_nodes/util/download_helpers.py b/comfy_api_nodes/util/download_helpers.py index f89045e12..364874bed 100644 --- a/comfy_api_nodes/util/download_helpers.py +++ b/comfy_api_nodes/util/download_helpers.py @@ -232,11 +232,12 @@ async def download_url_to_video_output( video_url: str, *, timeout: float = None, + max_retries: int = 5, cls: type[COMFY_IO.ComfyNode] = None, ) -> VideoFromFile: """Downloads a video from a URL and returns a `VIDEO` output.""" result = BytesIO() - await download_url_to_bytesio(video_url, result, timeout=timeout, cls=cls) + await download_url_to_bytesio(video_url, result, timeout=timeout, max_retries=max_retries, cls=cls) return VideoFromFile(result) From 1a58087ac2eb64be3645934d0025aafaa5bdce38 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 29 Oct 2025 12:43:51 -0700 Subject: [PATCH 508/759] Reduce memory usage for fp8 scaled op. (#10531) --- comfy/quant_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index b14e03084..fb35a0d40 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -358,7 +358,7 @@ class TensorCoreFP8Layout(QuantizedLayout): scale = scale.to(device=tensor.device, dtype=torch.float32) lp_amax = torch.finfo(dtype).max - tensor_scaled = tensor.float() / scale + tensor_scaled = tensor * (1.0 / scale).to(tensor.dtype) torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled) qdata = tensor_scaled.to(dtype, memory_format=torch.contiguous_format) From ec4fc2a09a390d0d81500c51fb9e4d8a7a5ce1fc Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 29 Oct 2025 12:48:06 -0700 Subject: [PATCH 509/759] Fix case of weights not being unpinned. (#10533) --- comfy/model_patcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index aec73349c..119119e96 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1283,5 +1283,6 @@ class ModelPatcher: self.clear_cached_hook_weights() def __del__(self): + self.unpin_all_weights() self.detach(unpatch_all=False) From ab7ab5be23fb9b71d1790f424e7dcf91dc1fe0cc Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Thu, 30 Oct 2025 07:17:46 +1000 Subject: [PATCH 510/759] Fix Race condition in --async-offload that can cause corruption (#10501) * mm: factor out the current stream getter Make this a reusable function. * ops: sync the offload stream with the consumption of w&b This sync is nessacary as pytorch will queue cuda async frees on the same stream as created to tensor. In the case of async offload, this will be on the offload stream. Weights and biases can go out of scope in python which then triggers the pytorch garbage collector to queue the free operation on the offload stream possible before the compute stream has used the weight. This causes a use after free on weight data leading to total corruption of some workflows. So sync the offload stream with the compute stream after the weight has been used so the free has to wait for the weight to be used. The cast_bias_weight is extended in a backwards compatible way with the new behaviour opt-in on a defaulted parameter. This handles custom node packs calling cast_bias_weight and defeatures async-offload for them (as they do not handle the race). The pattern is now: cast_bias_weight(... , offloadable=True) #This might be offloaded thing(weight, bias, ...) uncast_bias_weight(...) * controlnet: adopt new cast_bias_weight synchronization scheme This is nessacary for safe async weight offloading. * mm: sync the last stream in the queue, not the next Currently this peeks ahead to sync the next stream in the queue of streams with the compute stream. This doesnt allow a lot of parallelization, as then end result is you can only get one weight load ahead regardless of how many streams you have. Rotate the loop logic here to synchronize the end of the queue before returning the next stream. This allows weights to be loaded ahead of the compute streams position. --- comfy/controlnet.py | 17 +++--- comfy/model_management.py | 28 +++++---- comfy/ops.py | 121 ++++++++++++++++++++++++++++---------- 3 files changed, 114 insertions(+), 52 deletions(-) diff --git a/comfy/controlnet.py b/comfy/controlnet.py index f08ff4b36..0b5e30f52 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -310,11 +310,13 @@ class ControlLoraOps: self.bias = None def forward(self, input): - weight, bias = comfy.ops.cast_bias_weight(self, input) + weight, bias, offload_stream = comfy.ops.cast_bias_weight(self, input, offloadable=True) if self.up is not None: - return torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias) + x = torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias) else: - return torch.nn.functional.linear(input, weight, bias) + x = torch.nn.functional.linear(input, weight, bias) + comfy.ops.uncast_bias_weight(self, weight, bias, offload_stream) + return x class Conv2d(torch.nn.Module, comfy.ops.CastWeightBiasOp): def __init__( @@ -350,12 +352,13 @@ class ControlLoraOps: def forward(self, input): - weight, bias = comfy.ops.cast_bias_weight(self, input) + weight, bias, offload_stream = comfy.ops.cast_bias_weight(self, input, offloadable=True) if self.up is not None: - return torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups) + x = torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups) else: - return torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) - + x = torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) + comfy.ops.uncast_bias_weight(self, weight, bias, offload_stream) + return x class ControlLora(ControlNet): def __init__(self, control_weights, global_average_pooling=False, model_options={}): #TODO? model_options diff --git a/comfy/model_management.py b/comfy/model_management.py index 3e5b977d4..79c0dfdb4 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1013,6 +1013,16 @@ if args.async_offload: NUM_STREAMS = 2 logging.info("Using async weight offloading with {} streams".format(NUM_STREAMS)) +def current_stream(device): + if device is None: + return None + if is_device_cuda(device): + return torch.cuda.current_stream() + elif is_device_xpu(device): + return torch.xpu.current_stream() + else: + return None + stream_counters = {} def get_offload_stream(device): stream_counter = stream_counters.get(device, 0) @@ -1021,21 +1031,17 @@ def get_offload_stream(device): if device in STREAMS: ss = STREAMS[device] - s = ss[stream_counter] + #Sync the oldest stream in the queue with the current + ss[stream_counter].wait_stream(current_stream(device)) stream_counter = (stream_counter + 1) % len(ss) - if is_device_cuda(device): - ss[stream_counter].wait_stream(torch.cuda.current_stream()) - elif is_device_xpu(device): - ss[stream_counter].wait_stream(torch.xpu.current_stream()) stream_counters[device] = stream_counter - return s + return ss[stream_counter] elif is_device_cuda(device): ss = [] for k in range(NUM_STREAMS): ss.append(torch.cuda.Stream(device=device, priority=0)) STREAMS[device] = ss s = ss[stream_counter] - stream_counter = (stream_counter + 1) % len(ss) stream_counters[device] = stream_counter return s elif is_device_xpu(device): @@ -1044,18 +1050,14 @@ def get_offload_stream(device): ss.append(torch.xpu.Stream(device=device, priority=0)) STREAMS[device] = ss s = ss[stream_counter] - stream_counter = (stream_counter + 1) % len(ss) stream_counters[device] = stream_counter return s return None def sync_stream(device, stream): - if stream is None: + if stream is None or current_stream(device) is None: return - if is_device_cuda(device): - torch.cuda.current_stream().wait_stream(stream) - elif is_device_xpu(device): - torch.xpu.current_stream().wait_stream(stream) + current_stream(device).wait_stream(stream) def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None): if device is None or weight.device == device: diff --git a/comfy/ops.py b/comfy/ops.py index 93731eedf..71ca7a2bd 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -70,8 +70,12 @@ cast_to = comfy.model_management.cast_to #TODO: remove once no more references def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) + @torch.compiler.disable() -def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None): +def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False): + # NOTE: offloadable=False is a a legacy and if you are a custom node author reading this please pass + # offloadable=True and call uncast_bias_weight() after your last usage of the weight/bias. This + # will add async-offload support to your cast and improve performance. if input is not None: if dtype is None: dtype = input.dtype @@ -80,7 +84,11 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None): if device is None: device = input.device - offload_stream = comfy.model_management.get_offload_stream(device) + if offloadable: + offload_stream = comfy.model_management.get_offload_stream(device) + else: + offload_stream = None + if offload_stream is not None: wf_context = offload_stream else: @@ -105,7 +113,24 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None): weight = f(weight) comfy.model_management.sync_stream(device, offload_stream) - return weight, bias + if offloadable: + return weight, bias, offload_stream + else: + #Legacy function signature + return weight, bias + + +def uncast_bias_weight(s, weight, bias, offload_stream): + if offload_stream is None: + return + if weight is not None: + device = weight.device + else: + if bias is None: + return + device = bias.device + offload_stream.wait_stream(comfy.model_management.current_stream(device)) + class CastWeightBiasOp: comfy_cast_weights = False @@ -118,8 +143,10 @@ class disable_weight_init: return None def forward_comfy_cast_weights(self, input): - weight, bias = cast_bias_weight(self, input) - return torch.nn.functional.linear(input, weight, bias) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = torch.nn.functional.linear(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): run_every_op() @@ -133,8 +160,10 @@ class disable_weight_init: return None def forward_comfy_cast_weights(self, input): - weight, bias = cast_bias_weight(self, input) - return self._conv_forward(input, weight, bias) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = self._conv_forward(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): run_every_op() @@ -148,8 +177,10 @@ class disable_weight_init: return None def forward_comfy_cast_weights(self, input): - weight, bias = cast_bias_weight(self, input) - return self._conv_forward(input, weight, bias) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = self._conv_forward(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): run_every_op() @@ -172,8 +203,10 @@ class disable_weight_init: return super()._conv_forward(input, weight, bias, *args, **kwargs) def forward_comfy_cast_weights(self, input): - weight, bias = cast_bias_weight(self, input) - return self._conv_forward(input, weight, bias) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = self._conv_forward(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): run_every_op() @@ -187,8 +220,10 @@ class disable_weight_init: return None def forward_comfy_cast_weights(self, input): - weight, bias = cast_bias_weight(self, input) - return torch.nn.functional.group_norm(input, self.num_groups, weight, bias, self.eps) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = torch.nn.functional.group_norm(input, self.num_groups, weight, bias, self.eps) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): run_every_op() @@ -203,11 +238,14 @@ class disable_weight_init: def forward_comfy_cast_weights(self, input): if self.weight is not None: - weight, bias = cast_bias_weight(self, input) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) else: weight = None bias = None - return torch.nn.functional.layer_norm(input, self.normalized_shape, weight, bias, self.eps) + offload_stream = None + x = torch.nn.functional.layer_norm(input, self.normalized_shape, weight, bias, self.eps) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): run_every_op() @@ -223,11 +261,15 @@ class disable_weight_init: def forward_comfy_cast_weights(self, input): if self.weight is not None: - weight, bias = cast_bias_weight(self, input) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) else: weight = None - return comfy.rmsnorm.rms_norm(input, weight, self.eps) # TODO: switch to commented out line when old torch is deprecated - # return torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps) + bias = None + offload_stream = None + x = comfy.rmsnorm.rms_norm(input, weight, self.eps) # TODO: switch to commented out line when old torch is deprecated + # x = torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): run_every_op() @@ -246,10 +288,12 @@ class disable_weight_init: input, output_size, self.stride, self.padding, self.kernel_size, num_spatial_dims, self.dilation) - weight, bias = cast_bias_weight(self, input) - return torch.nn.functional.conv_transpose2d( + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = torch.nn.functional.conv_transpose2d( input, weight, bias, self.stride, self.padding, output_padding, self.groups, self.dilation) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): run_every_op() @@ -268,10 +312,12 @@ class disable_weight_init: input, output_size, self.stride, self.padding, self.kernel_size, num_spatial_dims, self.dilation) - weight, bias = cast_bias_weight(self, input) - return torch.nn.functional.conv_transpose1d( + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = torch.nn.functional.conv_transpose1d( input, weight, bias, self.stride, self.padding, output_padding, self.groups, self.dilation) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): run_every_op() @@ -289,8 +335,11 @@ class disable_weight_init: output_dtype = out_dtype if self.weight.dtype == torch.float16 or self.weight.dtype == torch.bfloat16: out_dtype = None - weight, bias = cast_bias_weight(self, device=input.device, dtype=out_dtype) - return torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype) + weight, bias, offload_stream = cast_bias_weight(self, device=input.device, dtype=out_dtype, offloadable=True) + x = torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype) + uncast_bias_weight(self, weight, bias, offload_stream) + return x + def forward(self, *args, **kwargs): run_every_op() @@ -361,7 +410,7 @@ def fp8_linear(self, input): input_dtype = input.dtype if len(input.shape) == 3: - w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype) + w, bias, offload_stream = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype, offloadable=True) scale_weight = self.scale_weight scale_input = self.scale_input @@ -382,6 +431,8 @@ def fp8_linear(self, input): quantized_input = QuantizedTensor.from_float(input.reshape(-1, input_shape[2]), TensorCoreFP8Layout, scale=scale_input, dtype=dtype) o = torch.nn.functional.linear(quantized_input, quantized_weight, bias) + uncast_bias_weight(self, w, bias, offload_stream) + if tensor_2d: return o.reshape(input_shape[0], -1) return o.reshape((-1, input_shape[1], self.weight.shape[0])) @@ -404,8 +455,10 @@ class fp8_ops(manual_cast): except Exception as e: logging.info("Exception during fp8 op: {}".format(e)) - weight, bias = cast_bias_weight(self, input) - return torch.nn.functional.linear(input, weight, bias) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = torch.nn.functional.linear(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None): logging.info("Using scaled fp8: fp8 matrix mult: {}, scale input: {}".format(fp8_matrix_mult, scale_input)) @@ -433,12 +486,14 @@ def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None if out is not None: return out - weight, bias = cast_bias_weight(self, input) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) if weight.numel() < input.numel(): #TODO: optimize - return torch.nn.functional.linear(input, weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype), bias) + x = torch.nn.functional.linear(input, weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype), bias) else: - return torch.nn.functional.linear(input * self.scale_weight.to(device=weight.device, dtype=weight.dtype), weight, bias) + x = torch.nn.functional.linear(input * self.scale_weight.to(device=weight.device, dtype=weight.dtype), weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def convert_weight(self, weight, inplace=False, **kwargs): if inplace: @@ -577,8 +632,10 @@ class MixedPrecisionOps(disable_weight_init): return torch.nn.functional.linear(input, weight, bias) def forward_comfy_cast_weights(self, input): - weight, bias = cast_bias_weight(self, input) - return self._forward(input, weight, bias) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = self._forward(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, input, *args, **kwargs): run_every_op() From 25de7b1bfa22dd98922f047a1342cc97f8e46c5b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 29 Oct 2025 14:20:27 -0700 Subject: [PATCH 511/759] Try to fix slow load issue on low ram hardware with pinned mem. (#10536) --- comfy/model_patcher.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 119119e96..74b9e48bc 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -658,6 +658,7 @@ class ModelPatcher: loading = self._load_list() load_completely = [] + offloaded = [] loading.sort(reverse=True) for x in loading: n = x[1] @@ -699,8 +700,7 @@ class ModelPatcher: patch_counter += 1 cast_weight = True - for param in params: - self.pin_weight_to_device("{}.{}".format(n, param)) + offloaded.append((module_mem, n, m, params)) else: if hasattr(m, "comfy_cast_weights"): wipe_lowvram_weight(m) @@ -741,6 +741,12 @@ class ModelPatcher: for x in load_completely: x[2].to(device_to) + for x in offloaded: + n = x[1] + params = x[3] + for param in params: + self.pin_weight_to_device("{}.{}".format(n, param)) + if lowvram_counter > 0: logging.info("loaded partially {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), patch_counter)) self.model.model_lowvram = True From 906c0899575a83ac69bb095e835fdec748891da4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 29 Oct 2025 16:29:01 -0700 Subject: [PATCH 512/759] Fix small performance regression with fp8 fast and scaled fp8. (#10537) --- comfy/ops.py | 6 +++++- comfy/quant_ops.py | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 71ca7a2bd..18f6b804b 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -421,14 +421,18 @@ def fp8_linear(self, input): if scale_input is None: scale_input = torch.ones((), device=input.device, dtype=torch.float32) + input = torch.clamp(input, min=-448, max=448, out=input) + input = input.reshape(-1, input_shape[2]).to(dtype).contiguous() + layout_params_weight = {'scale': scale_input, 'orig_dtype': input_dtype} + quantized_input = QuantizedTensor(input.reshape(-1, input_shape[2]).to(dtype).contiguous(), TensorCoreFP8Layout, layout_params_weight) else: scale_input = scale_input.to(input.device) + quantized_input = QuantizedTensor.from_float(input.reshape(-1, input_shape[2]), TensorCoreFP8Layout, scale=scale_input, dtype=dtype) # Wrap weight in QuantizedTensor - this enables unified dispatch # Call F.linear - __torch_dispatch__ routes to fp8_linear handler in quant_ops.py! layout_params_weight = {'scale': scale_weight, 'orig_dtype': input_dtype} quantized_weight = QuantizedTensor(w, TensorCoreFP8Layout, layout_params_weight) - quantized_input = QuantizedTensor.from_float(input.reshape(-1, input_shape[2]), TensorCoreFP8Layout, scale=scale_input, dtype=dtype) o = torch.nn.functional.linear(quantized_input, quantized_weight, bias) uncast_bias_weight(self, w, bias, offload_stream) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index fb35a0d40..c822fe53c 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -357,9 +357,10 @@ class TensorCoreFP8Layout(QuantizedLayout): scale = torch.tensor(scale) scale = scale.to(device=tensor.device, dtype=torch.float32) - lp_amax = torch.finfo(dtype).max tensor_scaled = tensor * (1.0 / scale).to(tensor.dtype) - torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled) + # TODO: uncomment this if it's actually needed because the clamp has a small performance penality' + # lp_amax = torch.finfo(dtype).max + # torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled) qdata = tensor_scaled.to(dtype, memory_format=torch.contiguous_format) layout_params = { From 998bf60bebd03e57a55e106434657849342b733f Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Wed, 29 Oct 2025 16:37:06 -0700 Subject: [PATCH 513/759] Add units/info for the numbers displayed on 'load completely' and 'load partially' log messages (#10538) --- comfy/model_patcher.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 74b9e48bc..ed3f3f5cb 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -655,6 +655,7 @@ class ModelPatcher: mem_counter = 0 patch_counter = 0 lowvram_counter = 0 + lowvram_mem_counter = 0 loading = self._load_list() load_completely = [] @@ -675,6 +676,7 @@ class ModelPatcher: if mem_counter + module_mem >= lowvram_model_memory: lowvram_weight = True lowvram_counter += 1 + lowvram_mem_counter += module_mem if hasattr(m, "prev_comfy_cast_weights"): #Already lowvramed continue @@ -748,10 +750,10 @@ class ModelPatcher: self.pin_weight_to_device("{}.{}".format(n, param)) if lowvram_counter > 0: - logging.info("loaded partially {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), patch_counter)) + logging.info("loaded partially; {:.2f} MB usable, {:.2f} MB loaded, {:.2f} MB offloaded, lowvram patches: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), lowvram_mem_counter / (1024 * 1024), patch_counter)) self.model.model_lowvram = True else: - logging.info("loaded completely {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load)) + logging.info("loaded completely; {:.2f} MB usable, {:.2f} MB loaded, full load: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load)) self.model.model_lowvram = False if full_load: self.model.to(device_to) From 163b629c70a349c7d1e91eebc5365713e770af8a Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 30 Oct 2025 08:49:03 +0200 Subject: [PATCH 514/759] use new API client in Pixverse and Ideogram nodes (#10543) --- comfy_api_nodes/apinode_utils.py | 109 +------------ comfy_api_nodes/nodes_bfl.py | 43 +---- comfy_api_nodes/nodes_bytedance.py | 12 +- comfy_api_nodes/nodes_ideogram.py | 137 ++++------------ comfy_api_nodes/nodes_kling.py | 2 +- comfy_api_nodes/nodes_pixverse.py | 194 ++++++----------------- comfy_api_nodes/nodes_recraft.py | 4 +- comfy_api_nodes/nodes_runway.py | 10 +- comfy_api_nodes/nodes_vidu.py | 12 +- comfy_api_nodes/util/__init__.py | 10 +- comfy_api_nodes/util/conversions.py | 21 +++ comfy_api_nodes/util/validation_utils.py | 125 ++++++++++----- 12 files changed, 220 insertions(+), 459 deletions(-) diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py index 6a72b9d1d..ecd604ff8 100644 --- a/comfy_api_nodes/apinode_utils.py +++ b/comfy_api_nodes/apinode_utils.py @@ -1,15 +1,12 @@ from __future__ import annotations import aiohttp import mimetypes -from typing import Optional, Union -from comfy.utils import common_upscale +from typing import Union from server import PromptServer -from comfy.cli_args import args import numpy as np from PIL import Image import torch -import math import base64 from io import BytesIO @@ -60,85 +57,6 @@ async def validate_and_cast_response( return torch.stack(image_tensors, dim=0) -def validate_aspect_ratio( - aspect_ratio: str, - minimum_ratio: float, - maximum_ratio: float, - minimum_ratio_str: str, - maximum_ratio_str: str, -) -> float: - """Validates and casts an aspect ratio string to a float. - - Args: - aspect_ratio: The aspect ratio string to validate. - minimum_ratio: The minimum aspect ratio. - maximum_ratio: The maximum aspect ratio. - minimum_ratio_str: The minimum aspect ratio string. - maximum_ratio_str: The maximum aspect ratio string. - - Returns: - The validated and cast aspect ratio. - - Raises: - Exception: If the aspect ratio is not valid. - """ - # get ratio values - numbers = aspect_ratio.split(":") - if len(numbers) != 2: - raise TypeError( - f"Aspect ratio must be in the format X:Y, such as 16:9, but was {aspect_ratio}." - ) - try: - numerator = int(numbers[0]) - denominator = int(numbers[1]) - except ValueError as exc: - raise TypeError( - f"Aspect ratio must contain numbers separated by ':', such as 16:9, but was {aspect_ratio}." - ) from exc - calculated_ratio = numerator / denominator - # if not close to minimum and maximum, check bounds - if not math.isclose(calculated_ratio, minimum_ratio) or not math.isclose( - calculated_ratio, maximum_ratio - ): - if calculated_ratio < minimum_ratio: - raise TypeError( - f"Aspect ratio cannot reduce to any less than {minimum_ratio_str} ({minimum_ratio}), but was {aspect_ratio} ({calculated_ratio})." - ) - if calculated_ratio > maximum_ratio: - raise TypeError( - f"Aspect ratio cannot reduce to any greater than {maximum_ratio_str} ({maximum_ratio}), but was {aspect_ratio} ({calculated_ratio})." - ) - return aspect_ratio - - -async def download_url_to_bytesio( - url: str, timeout: int = None, auth_kwargs: Optional[dict[str, str]] = None -) -> BytesIO: - """Downloads content from a URL using requests and returns it as BytesIO. - - Args: - url: The URL to download. - timeout: Request timeout in seconds. Defaults to None (no timeout). - - Returns: - BytesIO object containing the downloaded content. - """ - headers = {} - if url.startswith("/proxy/"): - url = str(args.comfy_api_base).rstrip("/") + url - auth_token = auth_kwargs.get("auth_token") - comfy_api_key = auth_kwargs.get("comfy_api_key") - if auth_token: - headers["Authorization"] = f"Bearer {auth_token}" - elif comfy_api_key: - headers["X-API-KEY"] = comfy_api_key - timeout_cfg = aiohttp.ClientTimeout(total=timeout) if timeout else None - async with aiohttp.ClientSession(timeout=timeout_cfg) as session: - async with session.get(url, headers=headers) as resp: - resp.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX) - return BytesIO(await resp.read()) - - def text_filepath_to_base64_string(filepath: str) -> str: """Converts a text file to a base64 string.""" with open(filepath, "rb") as f: @@ -153,28 +71,3 @@ def text_filepath_to_data_uri(filepath: str) -> str: if mime_type is None: mime_type = "application/octet-stream" return f"data:{mime_type};base64,{base64_string}" - - -def resize_mask_to_image( - mask: torch.Tensor, - image: torch.Tensor, - upscale_method="nearest-exact", - crop="disabled", - allow_gradient=True, - add_channel_dim=False, -): - """ - Resize mask to be the same dimensions as an image, while maintaining proper format for API calls. - """ - _, H, W, _ = image.shape - mask = mask.unsqueeze(-1) - mask = mask.movedim(-1, 1) - mask = common_upscale( - mask, width=W, height=H, upscale_method=upscale_method, crop=crop - ) - mask = mask.movedim(1, -1) - if not add_channel_dim: - mask = mask.squeeze(-1) - if not allow_gradient: - mask = (mask > 0.5).float() - return mask diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py index baa74fd52..1740fb377 100644 --- a/comfy_api_nodes/nodes_bfl.py +++ b/comfy_api_nodes/nodes_bfl.py @@ -5,10 +5,6 @@ import torch from typing_extensions import override from comfy_api.latest import IO, ComfyExtension -from comfy_api_nodes.apinode_utils import ( - resize_mask_to_image, - validate_aspect_ratio, -) from comfy_api_nodes.apis.bfl_api import ( BFLFluxExpandImageRequest, BFLFluxFillImageRequest, @@ -23,8 +19,10 @@ from comfy_api_nodes.util import ( ApiEndpoint, download_url_to_image_tensor, poll_op, + resize_mask_to_image, sync_op, tensor_to_base64_string, + validate_aspect_ratio_string, validate_string, ) @@ -43,11 +41,6 @@ class FluxProUltraImageNode(IO.ComfyNode): Generates images using Flux Pro 1.1 Ultra via api based on prompt and resolution. """ - MINIMUM_RATIO = 1 / 4 - MAXIMUM_RATIO = 4 / 1 - MINIMUM_RATIO_STR = "1:4" - MAXIMUM_RATIO_STR = "4:1" - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( @@ -112,16 +105,7 @@ class FluxProUltraImageNode(IO.ComfyNode): @classmethod def validate_inputs(cls, aspect_ratio: str): - try: - validate_aspect_ratio( - aspect_ratio, - minimum_ratio=cls.MINIMUM_RATIO, - maximum_ratio=cls.MAXIMUM_RATIO, - minimum_ratio_str=cls.MINIMUM_RATIO_STR, - maximum_ratio_str=cls.MAXIMUM_RATIO_STR, - ) - except Exception as e: - return str(e) + validate_aspect_ratio_string(aspect_ratio, (1, 4), (4, 1)) return True @classmethod @@ -145,13 +129,7 @@ class FluxProUltraImageNode(IO.ComfyNode): prompt=prompt, prompt_upsampling=prompt_upsampling, seed=seed, - aspect_ratio=validate_aspect_ratio( - aspect_ratio, - minimum_ratio=cls.MINIMUM_RATIO, - maximum_ratio=cls.MAXIMUM_RATIO, - minimum_ratio_str=cls.MINIMUM_RATIO_STR, - maximum_ratio_str=cls.MAXIMUM_RATIO_STR, - ), + aspect_ratio=aspect_ratio, raw=raw, image_prompt=(image_prompt if image_prompt is None else tensor_to_base64_string(image_prompt)), image_prompt_strength=(None if image_prompt is None else round(image_prompt_strength, 2)), @@ -180,11 +158,6 @@ class FluxKontextProImageNode(IO.ComfyNode): Edits images using Flux.1 Kontext [pro] via api based on prompt and aspect ratio. """ - MINIMUM_RATIO = 1 / 4 - MAXIMUM_RATIO = 4 / 1 - MINIMUM_RATIO_STR = "1:4" - MAXIMUM_RATIO_STR = "4:1" - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( @@ -261,13 +234,7 @@ class FluxKontextProImageNode(IO.ComfyNode): seed=0, prompt_upsampling=False, ) -> IO.NodeOutput: - aspect_ratio = validate_aspect_ratio( - aspect_ratio, - minimum_ratio=cls.MINIMUM_RATIO, - maximum_ratio=cls.MAXIMUM_RATIO, - minimum_ratio_str=cls.MINIMUM_RATIO_STR, - maximum_ratio_str=cls.MAXIMUM_RATIO_STR, - ) + validate_aspect_ratio_string(aspect_ratio, (1, 4), (4, 1)) if input_image is None: validate_string(prompt, strip_whitespace=False) initial_response = await sync_op( diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index 534af380d..caced471e 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -17,7 +17,7 @@ from comfy_api_nodes.util import ( poll_op, sync_op, upload_images_to_comfyapi, - validate_image_aspect_ratio_range, + validate_image_aspect_ratio, validate_image_dimensions, validate_string, ) @@ -403,7 +403,7 @@ class ByteDanceImageEditNode(IO.ComfyNode): validate_string(prompt, strip_whitespace=True, min_length=1) if get_number_of_images(image) != 1: raise ValueError("Exactly one input image is required.") - validate_image_aspect_ratio_range(image, (1, 3), (3, 1)) + validate_image_aspect_ratio(image, (1, 3), (3, 1)) source_url = (await upload_images_to_comfyapi(cls, image, max_images=1, mime_type="image/png"))[0] payload = Image2ImageTaskCreationRequest( model=model, @@ -565,7 +565,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode): reference_images_urls = [] if n_input_images: for i in image: - validate_image_aspect_ratio_range(i, (1, 3), (3, 1)) + validate_image_aspect_ratio(i, (1, 3), (3, 1)) reference_images_urls = await upload_images_to_comfyapi( cls, image, @@ -798,7 +798,7 @@ class ByteDanceImageToVideoNode(IO.ComfyNode): validate_string(prompt, strip_whitespace=True, min_length=1) raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"]) validate_image_dimensions(image, min_width=300, min_height=300, max_width=6000, max_height=6000) - validate_image_aspect_ratio_range(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 + validate_image_aspect_ratio(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 image_url = (await upload_images_to_comfyapi(cls, image, max_images=1))[0] prompt = ( @@ -923,7 +923,7 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode): raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"]) for i in (first_frame, last_frame): validate_image_dimensions(i, min_width=300, min_height=300, max_width=6000, max_height=6000) - validate_image_aspect_ratio_range(i, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 + validate_image_aspect_ratio(i, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 download_urls = await upload_images_to_comfyapi( cls, @@ -1045,7 +1045,7 @@ class ByteDanceImageReferenceNode(IO.ComfyNode): raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "watermark"]) for image in images: validate_image_dimensions(image, min_width=300, min_height=300, max_width=6000, max_height=6000) - validate_image_aspect_ratio_range(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 + validate_image_aspect_ratio(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 image_urls = await upload_images_to_comfyapi(cls, images, max_images=4, mime_type="image/png") prompt = ( diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index d8fd3378b..48f94e612 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -1,6 +1,6 @@ from io import BytesIO from typing_extensions import override -from comfy_api.latest import ComfyExtension, IO +from comfy_api.latest import IO, ComfyExtension from PIL import Image import numpy as np import torch @@ -11,19 +11,13 @@ from comfy_api_nodes.apis import ( IdeogramV3Request, IdeogramV3EditRequest, ) - -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, -) - -from comfy_api_nodes.apinode_utils import ( - download_url_to_bytesio, + bytesio_to_image_tensor, + download_url_as_bytesio, resize_mask_to_image, + sync_op, ) -from comfy_api_nodes.util import bytesio_to_image_tensor -from server import PromptServer V1_V1_RES_MAP = { "Auto":"AUTO", @@ -220,7 +214,7 @@ async def download_and_process_images(image_urls): for image_url in image_urls: # Using functions from apinode_utils.py to handle downloading and processing - image_bytesio = await download_url_to_bytesio(image_url) # Download image content to BytesIO + image_bytesio = await download_url_as_bytesio(image_url) # Download image content to BytesIO img_tensor = bytesio_to_image_tensor(image_bytesio, mode="RGB") # Convert to torch.Tensor with RGB mode image_tensors.append(img_tensor) @@ -233,19 +227,6 @@ async def download_and_process_images(image_urls): return stacked_tensors -def display_image_urls_on_node(image_urls, node_id): - if node_id and image_urls: - if len(image_urls) == 1: - PromptServer.instance.send_progress_text( - f"Generated Image URL:\n{image_urls[0]}", node_id - ) - else: - urls_text = "Generated Image URLs:\n" + "\n".join( - f"{i+1}. {url}" for i, url in enumerate(image_urls) - ) - PromptServer.instance.send_progress_text(urls_text, node_id) - - class IdeogramV1(IO.ComfyNode): @classmethod @@ -334,44 +315,30 @@ class IdeogramV1(IO.ComfyNode): aspect_ratio = V1_V2_RATIO_MAP.get(aspect_ratio, None) model = "V_1_TURBO" if turbo else "V_1" - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/ideogram/generate", - method=HttpMethod.POST, - request_model=IdeogramGenerateRequest, - response_model=IdeogramGenerateResponse, - ), - request=IdeogramGenerateRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/ideogram/generate", method="POST"), + response_model=IdeogramGenerateResponse, + data=IdeogramGenerateRequest( image_request=ImageRequest( prompt=prompt, model=model, num_images=num_images, seed=seed, aspect_ratio=aspect_ratio if aspect_ratio != "ASPECT_1_1" else None, - magic_prompt_option=( - magic_prompt_option if magic_prompt_option != "AUTO" else None - ), + magic_prompt_option=(magic_prompt_option if magic_prompt_option != "AUTO" else None), negative_prompt=negative_prompt if negative_prompt else None, ) ), - auth_kwargs=auth, + max_retries=1, ) - response = await operation.execute() - if not response.data or len(response.data) == 0: raise Exception("No images were generated in the response") image_urls = [image_data.url for image_data in response.data if image_data.url] - if not image_urls: raise Exception("No image URLs were generated in the response") - - display_image_urls_on_node(image_urls, cls.hidden.unique_id) return IO.NodeOutput(await download_and_process_images(image_urls)) @@ -500,18 +467,11 @@ class IdeogramV2(IO.ComfyNode): else: final_aspect_ratio = aspect_ratio if aspect_ratio != "ASPECT_1_1" else None - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/ideogram/generate", - method=HttpMethod.POST, - request_model=IdeogramGenerateRequest, - response_model=IdeogramGenerateResponse, - ), - request=IdeogramGenerateRequest( + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/ideogram/generate", method="POST"), + response_model=IdeogramGenerateResponse, + data=IdeogramGenerateRequest( image_request=ImageRequest( prompt=prompt, model=model, @@ -519,28 +479,20 @@ class IdeogramV2(IO.ComfyNode): seed=seed, aspect_ratio=final_aspect_ratio, resolution=final_resolution, - magic_prompt_option=( - magic_prompt_option if magic_prompt_option != "AUTO" else None - ), + magic_prompt_option=(magic_prompt_option if magic_prompt_option != "AUTO" else None), style_type=style_type if style_type != "NONE" else None, negative_prompt=negative_prompt if negative_prompt else None, color_palette=color_palette if color_palette else None, ) ), - auth_kwargs=auth, + max_retries=1, ) - - response = await operation.execute() - if not response.data or len(response.data) == 0: raise Exception("No images were generated in the response") image_urls = [image_data.url for image_data in response.data if image_data.url] - if not image_urls: raise Exception("No image URLs were generated in the response") - - display_image_urls_on_node(image_urls, cls.hidden.unique_id) return IO.NodeOutput(await download_and_process_images(image_urls)) @@ -656,10 +608,6 @@ class IdeogramV3(IO.ComfyNode): character_image=None, character_mask=None, ): - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } if rendering_speed == "BALANCED": # for backward compatibility rendering_speed = "DEFAULT" @@ -694,9 +642,6 @@ class IdeogramV3(IO.ComfyNode): # Check if both image and mask are provided for editing mode if image is not None and mask is not None: - # Edit mode - path = "/proxy/ideogram/ideogram-v3/edit" - # Process image and mask input_tensor = image.squeeze().cpu() # Resize mask to match image dimension @@ -749,27 +694,20 @@ class IdeogramV3(IO.ComfyNode): if character_mask_binary: files["character_mask_binary"] = character_mask_binary - # Execute the operation for edit mode - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=IdeogramV3EditRequest, - response_model=IdeogramGenerateResponse, - ), - request=edit_request, + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/ideogram/ideogram-v3/edit", method="POST"), + response_model=IdeogramGenerateResponse, + data=edit_request, files=files, content_type="multipart/form-data", - auth_kwargs=auth, + max_retries=1, ) elif image is not None or mask is not None: # If only one of image or mask is provided, raise an error raise Exception("Ideogram V3 image editing requires both an image AND a mask") else: - # Generation mode - path = "/proxy/ideogram/ideogram-v3/generate" - # Create generation request gen_request = IdeogramV3Request( prompt=prompt, @@ -800,32 +738,22 @@ class IdeogramV3(IO.ComfyNode): if files: gen_request.style_type = "AUTO" - # Execute the operation for generation mode - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=IdeogramV3Request, - response_model=IdeogramGenerateResponse, - ), - request=gen_request, + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/ideogram/ideogram-v3/generate", method="POST"), + response_model=IdeogramGenerateResponse, + data=gen_request, files=files if files else None, content_type="multipart/form-data", - auth_kwargs=auth, + max_retries=1, ) - # Execute the operation and process response - response = await operation.execute() - if not response.data or len(response.data) == 0: raise Exception("No images were generated in the response") image_urls = [image_data.url for image_data in response.data if image_data.url] - if not image_urls: raise Exception("No image URLs were generated in the response") - - display_image_urls_on_node(image_urls, cls.hidden.unique_id) return IO.NodeOutput(await download_and_process_images(image_urls)) @@ -838,5 +766,6 @@ class IdeogramExtension(ComfyExtension): IdeogramV3, ] + async def comfy_entrypoint() -> IdeogramExtension: return IdeogramExtension() diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index eea65c9ac..7b23e9cf9 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -282,7 +282,7 @@ def validate_input_image(image: torch.Tensor) -> None: See: https://app.klingai.com/global/dev/document-api/apiReference/model/imageToVideo """ validate_image_dimensions(image, min_width=300, min_height=300) - validate_image_aspect_ratio(image, min_aspect_ratio=1 / 2.5, max_aspect_ratio=2.5) + validate_image_aspect_ratio(image, (1, 2.5), (2.5, 1)) def get_video_from_response(response) -> KlingVideoResult: diff --git a/comfy_api_nodes/nodes_pixverse.py b/comfy_api_nodes/nodes_pixverse.py index b2b841be8..6e1686af0 100644 --- a/comfy_api_nodes/nodes_pixverse.py +++ b/comfy_api_nodes/nodes_pixverse.py @@ -1,7 +1,6 @@ -from inspect import cleandoc -from typing import Optional +import torch from typing_extensions import override -from io import BytesIO +from comfy_api.latest import IO, ComfyExtension from comfy_api_nodes.apis.pixverse_api import ( PixverseTextVideoRequest, PixverseImageVideoRequest, @@ -17,53 +16,30 @@ from comfy_api_nodes.apis.pixverse_api import ( PixverseIO, pixverse_templates, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, + download_url_to_video_output, + poll_op, + sync_op, + tensor_to_bytesio, + validate_string, ) -from comfy_api_nodes.util import validate_string, tensor_to_bytesio -from comfy_api.input_impl import VideoFromFile -from comfy_api.latest import ComfyExtension, IO - -import torch -import aiohttp - AVERAGE_DURATION_T2V = 32 AVERAGE_DURATION_I2V = 30 AVERAGE_DURATION_T2T = 52 -def get_video_url_from_response( - response: PixverseGenerationStatusResponse, -) -> Optional[str]: - if response.Resp is None or response.Resp.url is None: - return None - return str(response.Resp.url) - - -async def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None): - # first, upload image to Pixverse and get image id to use in actual generation call - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/pixverse/image/upload", - method=HttpMethod.POST, - request_model=EmptyRequest, - response_model=PixverseImageUploadResponse, - ), - request=EmptyRequest(), +async def upload_image_to_pixverse(cls: type[IO.ComfyNode], image: torch.Tensor): + response_upload = await sync_op( + cls, + ApiEndpoint(path="/proxy/pixverse/image/upload", method="POST"), + response_model=PixverseImageUploadResponse, files={"image": tensor_to_bytesio(image)}, content_type="multipart/form-data", - auth_kwargs=auth_kwargs, ) - response_upload: PixverseImageUploadResponse = await operation.execute() - if response_upload.Resp is None: raise Exception(f"PixVerse image upload request failed: '{response_upload.ErrMsg}'") - return response_upload.Resp.img_id @@ -93,17 +69,13 @@ class PixverseTemplateNode(IO.ComfyNode): class PixverseTextToVideoNode(IO.ComfyNode): - """ - Generates videos based on prompt and output_size. - """ - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="PixverseTextToVideoNode", display_name="PixVerse Text to Video", category="api node/video/PixVerse", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos based on prompt and output_size.", inputs=[ IO.String.Input( "prompt", @@ -170,7 +142,7 @@ class PixverseTextToVideoNode(IO.ComfyNode): negative_prompt: str = None, pixverse_template: int = None, ) -> IO.NodeOutput: - validate_string(prompt, strip_whitespace=False) + validate_string(prompt, strip_whitespace=False, min_length=1) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration if quality == PixverseQuality.res_1080p: @@ -179,18 +151,11 @@ class PixverseTextToVideoNode(IO.ComfyNode): elif duration_seconds != PixverseDuration.dur_5: motion_mode = PixverseMotionMode.normal - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/pixverse/video/text/generate", - method=HttpMethod.POST, - request_model=PixverseTextVideoRequest, - response_model=PixverseVideoResponse, - ), - request=PixverseTextVideoRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/pixverse/video/text/generate", method="POST"), + response_model=PixverseVideoResponse, + data=PixverseTextVideoRequest( prompt=prompt, aspect_ratio=aspect_ratio, quality=quality, @@ -200,20 +165,14 @@ class PixverseTextToVideoNode(IO.ComfyNode): template_id=pixverse_template, seed=seed, ), - auth_kwargs=auth, ) - response_api = await operation.execute() - if response_api.Resp is None: raise Exception(f"PixVerse request failed: '{response_api.ErrMsg}'") - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/pixverse/video/result/{response_api.Resp.video_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=PixverseGenerationStatusResponse, - ), + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/pixverse/video/result/{response_api.Resp.video_id}"), + response_model=PixverseGenerationStatusResponse, completed_statuses=[PixverseStatus.successful], failed_statuses=[ PixverseStatus.contents_moderation, @@ -221,30 +180,19 @@ class PixverseTextToVideoNode(IO.ComfyNode): PixverseStatus.deleted, ], status_extractor=lambda x: x.Resp.status, - auth_kwargs=auth, - node_id=cls.hidden.unique_id, - result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_T2V, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.Resp.url) as vid_response: - return IO.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) + return IO.NodeOutput(await download_url_to_video_output(response_poll.Resp.url)) class PixverseImageToVideoNode(IO.ComfyNode): - """ - Generates videos based on prompt and output_size. - """ - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="PixverseImageToVideoNode", display_name="PixVerse Image to Video", category="api node/video/PixVerse", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos based on prompt and output_size.", inputs=[ IO.Image.Input("image"), IO.String.Input( @@ -309,11 +257,7 @@ class PixverseImageToVideoNode(IO.ComfyNode): pixverse_template: int = None, ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - img_id = await upload_image_to_pixverse(image, auth_kwargs=auth) + img_id = await upload_image_to_pixverse(cls, image) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration @@ -323,14 +267,11 @@ class PixverseImageToVideoNode(IO.ComfyNode): elif duration_seconds != PixverseDuration.dur_5: motion_mode = PixverseMotionMode.normal - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/pixverse/video/img/generate", - method=HttpMethod.POST, - request_model=PixverseImageVideoRequest, - response_model=PixverseVideoResponse, - ), - request=PixverseImageVideoRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/pixverse/video/img/generate", method="POST"), + response_model=PixverseVideoResponse, + data=PixverseImageVideoRequest( img_id=img_id, prompt=prompt, quality=quality, @@ -340,20 +281,15 @@ class PixverseImageToVideoNode(IO.ComfyNode): template_id=pixverse_template, seed=seed, ), - auth_kwargs=auth, ) - response_api = await operation.execute() if response_api.Resp is None: raise Exception(f"PixVerse request failed: '{response_api.ErrMsg}'") - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/pixverse/video/result/{response_api.Resp.video_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=PixverseGenerationStatusResponse, - ), + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/pixverse/video/result/{response_api.Resp.video_id}"), + response_model=PixverseGenerationStatusResponse, completed_statuses=[PixverseStatus.successful], failed_statuses=[ PixverseStatus.contents_moderation, @@ -361,30 +297,19 @@ class PixverseImageToVideoNode(IO.ComfyNode): PixverseStatus.deleted, ], status_extractor=lambda x: x.Resp.status, - auth_kwargs=auth, - node_id=cls.hidden.unique_id, - result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_I2V, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.Resp.url) as vid_response: - return IO.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) + return IO.NodeOutput(await download_url_to_video_output(response_poll.Resp.url)) class PixverseTransitionVideoNode(IO.ComfyNode): - """ - Generates videos based on prompt and output_size. - """ - @classmethod def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="PixverseTransitionVideoNode", display_name="PixVerse Transition Video", category="api node/video/PixVerse", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos based on prompt and output_size.", inputs=[ IO.Image.Input("first_frame"), IO.Image.Input("last_frame"), @@ -445,12 +370,8 @@ class PixverseTransitionVideoNode(IO.ComfyNode): negative_prompt: str = None, ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - first_frame_id = await upload_image_to_pixverse(first_frame, auth_kwargs=auth) - last_frame_id = await upload_image_to_pixverse(last_frame, auth_kwargs=auth) + first_frame_id = await upload_image_to_pixverse(cls, first_frame) + last_frame_id = await upload_image_to_pixverse(cls, last_frame) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration @@ -460,14 +381,11 @@ class PixverseTransitionVideoNode(IO.ComfyNode): elif duration_seconds != PixverseDuration.dur_5: motion_mode = PixverseMotionMode.normal - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/pixverse/video/transition/generate", - method=HttpMethod.POST, - request_model=PixverseTransitionVideoRequest, - response_model=PixverseVideoResponse, - ), - request=PixverseTransitionVideoRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/pixverse/video/transition/generate", method="POST"), + response_model=PixverseVideoResponse, + data=PixverseTransitionVideoRequest( first_frame_img=first_frame_id, last_frame_img=last_frame_id, prompt=prompt, @@ -477,20 +395,15 @@ class PixverseTransitionVideoNode(IO.ComfyNode): negative_prompt=negative_prompt if negative_prompt else None, seed=seed, ), - auth_kwargs=auth, ) - response_api = await operation.execute() if response_api.Resp is None: raise Exception(f"PixVerse request failed: '{response_api.ErrMsg}'") - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/pixverse/video/result/{response_api.Resp.video_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=PixverseGenerationStatusResponse, - ), + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/pixverse/video/result/{response_api.Resp.video_id}"), + response_model=PixverseGenerationStatusResponse, completed_statuses=[PixverseStatus.successful], failed_statuses=[ PixverseStatus.contents_moderation, @@ -498,16 +411,9 @@ class PixverseTransitionVideoNode(IO.ComfyNode): PixverseStatus.deleted, ], status_extractor=lambda x: x.Resp.status, - auth_kwargs=auth, - node_id=cls.hidden.unique_id, - result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_T2V, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.Resp.url) as vid_response: - return IO.NodeOutput(VideoFromFile(BytesIO(await vid_response.content.read()))) + return IO.NodeOutput(await download_url_to_video_output(response_poll.Resp.url)) class PixVerseExtension(ComfyExtension): diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index dee186cd6..e3440b946 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -8,9 +8,6 @@ from typing_extensions import override from comfy.utils import ProgressBar from comfy_api.latest import IO, ComfyExtension -from comfy_api_nodes.apinode_utils import ( - resize_mask_to_image, -) from comfy_api_nodes.apis.recraft_api import ( RecraftColor, RecraftColorChain, @@ -28,6 +25,7 @@ from comfy_api_nodes.util import ( ApiEndpoint, bytesio_to_image_tensor, download_url_as_bytesio, + resize_mask_to_image, sync_op, tensor_to_bytesio, validate_string, diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py index 0543d1d0e..2fdafbbfe 100644 --- a/comfy_api_nodes/nodes_runway.py +++ b/comfy_api_nodes/nodes_runway.py @@ -200,7 +200,7 @@ class RunwayImageToVideoNodeGen3a(IO.ComfyNode): ) -> IO.NodeOutput: validate_string(prompt, min_length=1) validate_image_dimensions(start_frame, max_width=7999, max_height=7999) - validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) + validate_image_aspect_ratio(start_frame, (1, 2), (2, 1)) download_urls = await upload_images_to_comfyapi( cls, @@ -290,7 +290,7 @@ class RunwayImageToVideoNodeGen4(IO.ComfyNode): ) -> IO.NodeOutput: validate_string(prompt, min_length=1) validate_image_dimensions(start_frame, max_width=7999, max_height=7999) - validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) + validate_image_aspect_ratio(start_frame, (1, 2), (2, 1)) download_urls = await upload_images_to_comfyapi( cls, @@ -390,8 +390,8 @@ class RunwayFirstLastFrameNode(IO.ComfyNode): validate_string(prompt, min_length=1) validate_image_dimensions(start_frame, max_width=7999, max_height=7999) validate_image_dimensions(end_frame, max_width=7999, max_height=7999) - validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) - validate_image_aspect_ratio(end_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) + validate_image_aspect_ratio(start_frame, (1, 2), (2, 1)) + validate_image_aspect_ratio(end_frame, (1, 2), (2, 1)) stacked_input_images = image_tensor_pair_to_batch(start_frame, end_frame) download_urls = await upload_images_to_comfyapi( @@ -475,7 +475,7 @@ class RunwayTextToImageNode(IO.ComfyNode): reference_images = None if reference_image is not None: validate_image_dimensions(reference_image, max_width=7999, max_height=7999) - validate_image_aspect_ratio(reference_image, min_aspect_ratio=0.5, max_aspect_ratio=2.0) + validate_image_aspect_ratio(reference_image, (1, 2), (2, 1)) download_urls = await upload_images_to_comfyapi( cls, reference_image, diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py index 0e0572f8c..7a679f0d9 100644 --- a/comfy_api_nodes/nodes_vidu.py +++ b/comfy_api_nodes/nodes_vidu.py @@ -14,9 +14,9 @@ from comfy_api_nodes.util import ( poll_op, sync_op, upload_images_to_comfyapi, - validate_aspect_ratio_closeness, - validate_image_aspect_ratio_range, + validate_image_aspect_ratio, validate_image_dimensions, + validate_images_aspect_ratio_closeness, ) VIDU_TEXT_TO_VIDEO = "/proxy/vidu/text2video" @@ -114,7 +114,7 @@ async def execute_task( cls, ApiEndpoint(path=VIDU_GET_GENERATION_STATUS % response.task_id), response_model=TaskStatusResponse, - status_extractor=lambda r: r.state.value, + status_extractor=lambda r: r.state, estimated_duration=estimated_duration, ) @@ -307,7 +307,7 @@ class ViduImageToVideoNode(IO.ComfyNode): ) -> IO.NodeOutput: if get_number_of_images(image) > 1: raise ValueError("Only one input image is allowed.") - validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) + validate_image_aspect_ratio(image, (1, 4), (4, 1)) payload = TaskCreationRequest( model_name=model, prompt=prompt, @@ -423,7 +423,7 @@ class ViduReferenceVideoNode(IO.ComfyNode): if a > 7: raise ValueError("Too many images, maximum allowed is 7.") for image in images: - validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) + validate_image_aspect_ratio(image, (1, 4), (4, 1)) validate_image_dimensions(image, min_width=128, min_height=128) payload = TaskCreationRequest( model_name=model, @@ -533,7 +533,7 @@ class ViduStartEndToVideoNode(IO.ComfyNode): resolution: str, movement_amplitude: str, ) -> IO.NodeOutput: - validate_aspect_ratio_closeness(first_frame, end_frame, min_rel=0.8, max_rel=1.25, strict=False) + validate_images_aspect_ratio_closeness(first_frame, end_frame, min_rel=0.8, max_rel=1.25, strict=False) payload = TaskCreationRequest( model_name=model, prompt=prompt, diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py index 0cca2b59b..bbc71363a 100644 --- a/comfy_api_nodes/util/__init__.py +++ b/comfy_api_nodes/util/__init__.py @@ -14,6 +14,7 @@ from .conversions import ( downscale_image_tensor, image_tensor_pair_to_batch, pil_to_bytesio, + resize_mask_to_image, tensor_to_base64_string, tensor_to_bytesio, tensor_to_pil, @@ -34,12 +35,12 @@ from .upload_helpers import ( ) from .validation_utils import ( get_number_of_images, - validate_aspect_ratio_closeness, + validate_aspect_ratio_string, validate_audio_duration, validate_container_format_is_mp4, validate_image_aspect_ratio, - validate_image_aspect_ratio_range, validate_image_dimensions, + validate_images_aspect_ratio_closeness, validate_string, validate_video_dimensions, validate_video_duration, @@ -70,6 +71,7 @@ __all__ = [ "downscale_image_tensor", "image_tensor_pair_to_batch", "pil_to_bytesio", + "resize_mask_to_image", "tensor_to_base64_string", "tensor_to_bytesio", "tensor_to_pil", @@ -77,12 +79,12 @@ __all__ = [ "video_to_base64_string", # Validation utilities "get_number_of_images", - "validate_aspect_ratio_closeness", + "validate_aspect_ratio_string", "validate_audio_duration", "validate_container_format_is_mp4", "validate_image_aspect_ratio", - "validate_image_aspect_ratio_range", "validate_image_dimensions", + "validate_images_aspect_ratio_closeness", "validate_string", "validate_video_dimensions", "validate_video_duration", diff --git a/comfy_api_nodes/util/conversions.py b/comfy_api_nodes/util/conversions.py index 9f4c90c5c..b59c2bd84 100644 --- a/comfy_api_nodes/util/conversions.py +++ b/comfy_api_nodes/util/conversions.py @@ -430,3 +430,24 @@ def audio_bytes_to_audio_input(audio_bytes: bytes) -> dict: wav = torch.cat(frames, dim=1) # [C, T] wav = _f32_pcm(wav) return {"waveform": wav.unsqueeze(0).contiguous(), "sample_rate": out_sr} + + +def resize_mask_to_image( + mask: torch.Tensor, + image: torch.Tensor, + upscale_method="nearest-exact", + crop="disabled", + allow_gradient=True, + add_channel_dim=False, +): + """Resize mask to be the same dimensions as an image, while maintaining proper format for API calls.""" + _, height, width, _ = image.shape + mask = mask.unsqueeze(-1) + mask = mask.movedim(-1, 1) + mask = common_upscale(mask, width=width, height=height, upscale_method=upscale_method, crop=crop) + mask = mask.movedim(1, -1) + if not add_channel_dim: + mask = mask.squeeze(-1) + if not allow_gradient: + mask = (mask > 0.5).float() + return mask diff --git a/comfy_api_nodes/util/validation_utils.py b/comfy_api_nodes/util/validation_utils.py index 22da05bc1..ec7006aed 100644 --- a/comfy_api_nodes/util/validation_utils.py +++ b/comfy_api_nodes/util/validation_utils.py @@ -37,63 +37,62 @@ def validate_image_dimensions( def validate_image_aspect_ratio( image: torch.Tensor, - min_aspect_ratio: Optional[float] = None, - max_aspect_ratio: Optional[float] = None, -): - width, height = get_image_dimensions(image) - aspect_ratio = width / height - - if min_aspect_ratio is not None and aspect_ratio < min_aspect_ratio: - raise ValueError(f"Image aspect ratio must be at least {min_aspect_ratio}, got {aspect_ratio}") - if max_aspect_ratio is not None and aspect_ratio > max_aspect_ratio: - raise ValueError(f"Image aspect ratio must be at most {max_aspect_ratio}, got {aspect_ratio}") - - -def validate_image_aspect_ratio_range( - image: torch.Tensor, - min_ratio: tuple[float, float], # e.g. (1, 4) - max_ratio: tuple[float, float], # e.g. (4, 1) + min_ratio: Optional[tuple[float, float]] = None, # e.g. (1, 4) + max_ratio: Optional[tuple[float, float]] = None, # e.g. (4, 1) *, strict: bool = True, # True -> (min, max); False -> [min, max] ) -> float: - a1, b1 = min_ratio - a2, b2 = max_ratio - if a1 <= 0 or b1 <= 0 or a2 <= 0 or b2 <= 0: - raise ValueError("Ratios must be positive, like (1, 4) or (4, 1).") - lo, hi = (a1 / b1), (a2 / b2) - if lo > hi: - lo, hi = hi, lo - a1, b1, a2, b2 = a2, b2, a1, b1 # swap only for error text + """Validates that image aspect ratio is within min and max. If a bound is None, that side is not checked.""" w, h = get_image_dimensions(image) if w <= 0 or h <= 0: raise ValueError(f"Invalid image dimensions: {w}x{h}") ar = w / h - ok = (lo < ar < hi) if strict else (lo <= ar <= hi) - if not ok: - op = "<" if strict else "≤" - raise ValueError(f"Image aspect ratio {ar:.6g} is outside allowed range: {a1}:{b1} {op} ratio {op} {a2}:{b2}") + _assert_ratio_bounds(ar, min_ratio=min_ratio, max_ratio=max_ratio, strict=strict) return ar -def validate_aspect_ratio_closeness( - start_img, - end_img, - min_rel: float, - max_rel: float, +def validate_images_aspect_ratio_closeness( + first_image: torch.Tensor, + second_image: torch.Tensor, + min_rel: float, # e.g. 0.8 + max_rel: float, # e.g. 1.25 *, - strict: bool = False, # True => exclusive, False => inclusive -) -> None: - w1, h1 = get_image_dimensions(start_img) - w2, h2 = get_image_dimensions(end_img) + strict: bool = False, # True -> (min, max); False -> [min, max] +) -> float: + """ + Validates that the two images' aspect ratios are 'close'. + The closeness factor is C = max(ar1, ar2) / min(ar1, ar2) (C >= 1). + We require C <= limit, where limit = max(max_rel, 1.0 / min_rel). + + Returns the computed closeness factor C. + """ + w1, h1 = get_image_dimensions(first_image) + w2, h2 = get_image_dimensions(second_image) if min(w1, h1, w2, h2) <= 0: raise ValueError("Invalid image dimensions") ar1 = w1 / h1 ar2 = w2 / h2 - # Normalize so it is symmetric (no need to check both ar1/ar2 and ar2/ar1) closeness = max(ar1, ar2) / min(ar1, ar2) - limit = max(max_rel, 1.0 / min_rel) # for 0.8..1.25 this is 1.25 + limit = max(max_rel, 1.0 / min_rel) if (closeness >= limit) if strict else (closeness > limit): - raise ValueError(f"Aspect ratios must be close: start/end={ar1/ar2:.4f}, allowed range {min_rel}–{max_rel}.") + raise ValueError( + f"Aspect ratios must be close: ar1/ar2={ar1/ar2:.2g}, " + f"allowed range {min_rel}–{max_rel} (limit {limit:.2g})." + ) + return closeness + + +def validate_aspect_ratio_string( + aspect_ratio: str, + min_ratio: Optional[tuple[float, float]] = None, # e.g. (1, 4) + max_ratio: Optional[tuple[float, float]] = None, # e.g. (4, 1) + *, + strict: bool = False, # True -> (min, max); False -> [min, max] +) -> float: + """Parses 'X:Y' and validates it against optional bounds. Returns the numeric ratio.""" + ar = _parse_aspect_ratio_string(aspect_ratio) + _assert_ratio_bounds(ar, min_ratio=min_ratio, max_ratio=max_ratio, strict=strict) + return ar def validate_video_dimensions( @@ -183,3 +182,49 @@ def validate_container_format_is_mp4(video: VideoInput) -> None: container_format = video.get_container_format() if container_format not in ["mp4", "mov,mp4,m4a,3gp,3g2,mj2"]: raise ValueError(f"Only MP4 container format supported. Got: {container_format}") + + +def _ratio_from_tuple(r: tuple[float, float]) -> float: + a, b = r + if a <= 0 or b <= 0: + raise ValueError(f"Ratios must be positive, got {a}:{b}.") + return a / b + + +def _assert_ratio_bounds( + ar: float, + *, + min_ratio: Optional[tuple[float, float]] = None, + max_ratio: Optional[tuple[float, float]] = None, + strict: bool = True, +) -> None: + """Validate a numeric aspect ratio against optional min/max ratio bounds.""" + lo = _ratio_from_tuple(min_ratio) if min_ratio is not None else None + hi = _ratio_from_tuple(max_ratio) if max_ratio is not None else None + + if lo is not None and hi is not None and lo > hi: + lo, hi = hi, lo # normalize order if caller swapped them + + if lo is not None: + if (ar <= lo) if strict else (ar < lo): + op = "<" if strict else "≤" + raise ValueError(f"Aspect ratio `{ar:.2g}` must be {op} {lo:.2g}.") + if hi is not None: + if (ar >= hi) if strict else (ar > hi): + op = "<" if strict else "≤" + raise ValueError(f"Aspect ratio `{ar:.2g}` must be {op} {hi:.2g}.") + + +def _parse_aspect_ratio_string(ar_str: str) -> float: + """Parse 'X:Y' with integer parts into a positive float ratio X/Y.""" + parts = ar_str.split(":") + if len(parts) != 2: + raise ValueError(f"Aspect ratio must be 'X:Y' (e.g., 16:9), got '{ar_str}'.") + try: + a = int(parts[0].strip()) + b = int(parts[1].strip()) + except ValueError as exc: + raise ValueError(f"Aspect ratio must contain integers separated by ':', got '{ar_str}'.") from exc + if a <= 0 or b <= 0: + raise ValueError(f"Aspect ratio parts must be positive integers, got {a}:{b}.") + return a / b From dfac94695be95076d8028d04005a744f3ec0de8d Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 30 Oct 2025 19:22:35 +0200 Subject: [PATCH 515/759] fix img2img operation in Dall2 node (#10552) --- comfy_api_nodes/nodes_openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index c467e840c..b4568fc85 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -225,7 +225,7 @@ class OpenAIDalle2(ComfyNodeABC): ), files=( { - "image": img_binary, + "image": ("image.png", img_binary, "image/png"), } if img_binary else None From 513b0c46fba3bf40191d684ff81207ad935f1717 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Fri, 31 Oct 2025 07:39:02 +1000 Subject: [PATCH 516/759] Add RAM Pressure cache mode (#10454) * execution: Roll the UI cache into the outputs Currently the UI cache is parallel to the output cache with expectations of being a content superset of the output cache. At the same time the UI and output cache are maintained completely seperately, making it awkward to free the output cache content without changing the behaviour of the UI cache. There are two actual users (getters) of the UI cache. The first is the case of a direct content hit on the output cache when executing a node. This case is very naturally handled by merging the UI and outputs cache. The second case is the history JSON generation at the end of the prompt. This currently works by asking the cache for all_node_ids and then pulling the cache contents for those nodes. all_node_ids is the nodes of the dynamic prompt. So fold the UI cache into the output cache. The current UI cache setter now writes to a prompt-scope dict. When the output cache is set, just get this value from the dict and tuple up with the outputs. When generating the history, simply iterate prompt-scope dict. This prepares support for more complex caching strategies (like RAM pressure caching) where less than 1 workflow will be cached and it will be desirable to keep the UI cache and output cache in sync. * sd: Implement RAM getter for VAE * model_patcher: Implement RAM getter for ModelPatcher * sd: Implement RAM getter for CLIP * Implement RAM Pressure cache Implement a cache sensitive to RAM pressure. When RAM headroom drops down below a certain threshold, evict RAM-expensive nodes from the cache. Models and tensors are measured directly for RAM usage. An OOM score is then computed based on the RAM usage of the node. Note the due to indirection through shared objects (like a model patcher), multiple nodes can account the same RAM as their individual usage. The intent is this will free chains of nodes particularly model loaders and associate loras as they all score similar and are sorted in close to each other. Has a bias towards unloading model nodes mid flow while being able to keep results like text encodings and VAE. * execution: Convert the cache entry to NamedTuple As commented in review. Convert this to a named tuple and abstract away the tuple type completely from graph.py. --- comfy/cli_args.py | 1 + comfy/model_patcher.py | 3 ++ comfy/sd.py | 14 +++++++ comfy_execution/caching.py | 83 ++++++++++++++++++++++++++++++++++++++ comfy_execution/graph.py | 9 ++++- execution.py | 81 +++++++++++++++++++++---------------- main.py | 4 +- 7 files changed, 157 insertions(+), 38 deletions(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 001abd843..3d5bc7c90 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -105,6 +105,7 @@ cache_group = parser.add_mutually_exclusive_group() cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.") cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.") cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.") +cache_group.add_argument("--cache-ram", nargs='?', const=4.0, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threhold the cache remove large items to free RAM. Default 4GB") attn_group = parser.add_mutually_exclusive_group() attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.") diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index ed3f3f5cb..674a214ca 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -276,6 +276,9 @@ class ModelPatcher: self.size = comfy.model_management.module_size(self.model) return self.size + def get_ram_usage(self): + return self.model_size() + def loaded_size(self): return self.model.model_loaded_weight_memory diff --git a/comfy/sd.py b/comfy/sd.py index de4eee96e..9e5ebbf15 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -143,6 +143,9 @@ class CLIP: n.apply_hooks_to_conds = self.apply_hooks_to_conds return n + def get_ram_usage(self): + return self.patcher.get_ram_usage() + def add_patches(self, patches, strength_patch=1.0, strength_model=1.0): return self.patcher.add_patches(patches, strength_patch, strength_model) @@ -293,6 +296,7 @@ class VAE: self.working_dtypes = [torch.bfloat16, torch.float32] self.disable_offload = False self.not_video = False + self.size = None self.downscale_index_formula = None self.upscale_index_formula = None @@ -595,6 +599,16 @@ class VAE: self.patcher = comfy.model_patcher.ModelPatcher(self.first_stage_model, load_device=self.device, offload_device=offload_device) logging.info("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype)) + self.model_size() + + def model_size(self): + if self.size is not None: + return self.size + self.size = comfy.model_management.module_size(self.first_stage_model) + return self.size + + def get_ram_usage(self): + return self.model_size() def throw_exception_if_invalid(self): if self.first_stage_model is None: diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py index 566bc3f9c..b498f43e7 100644 --- a/comfy_execution/caching.py +++ b/comfy_execution/caching.py @@ -1,4 +1,9 @@ +import bisect +import gc import itertools +import psutil +import time +import torch from typing import Sequence, Mapping, Dict from comfy_execution.graph import DynamicPrompt from abc import ABC, abstractmethod @@ -188,6 +193,9 @@ class BasicCache: self._clean_cache() self._clean_subcaches() + def poll(self, **kwargs): + pass + def _set_immediate(self, node_id, value): assert self.initialized cache_key = self.cache_key_set.get_data_key(node_id) @@ -276,6 +284,9 @@ class NullCache: def clean_unused(self): pass + def poll(self, **kwargs): + pass + def get(self, node_id): return None @@ -336,3 +347,75 @@ class LRUCache(BasicCache): self._mark_used(child_id) self.children[cache_key].append(self.cache_key_set.get_data_key(child_id)) return self + + +#Iterating the cache for usage analysis might be expensive, so if we trigger make sure +#to take a chunk out to give breathing space on high-node / low-ram-per-node flows. + +RAM_CACHE_HYSTERESIS = 1.1 + +#This is kinda in GB but not really. It needs to be non-zero for the below heuristic +#and as long as Multi GB models dwarf this it will approximate OOM scoring OK + +RAM_CACHE_DEFAULT_RAM_USAGE = 0.1 + +#Exponential bias towards evicting older workflows so garbage will be taken out +#in constantly changing setups. + +RAM_CACHE_OLD_WORKFLOW_OOM_MULTIPLIER = 1.3 + +class RAMPressureCache(LRUCache): + + def __init__(self, key_class): + super().__init__(key_class, 0) + self.timestamps = {} + + def clean_unused(self): + self._clean_subcaches() + + def set(self, node_id, value): + self.timestamps[self.cache_key_set.get_data_key(node_id)] = time.time() + super().set(node_id, value) + + def get(self, node_id): + self.timestamps[self.cache_key_set.get_data_key(node_id)] = time.time() + return super().get(node_id) + + def poll(self, ram_headroom): + def _ram_gb(): + return psutil.virtual_memory().available / (1024**3) + + if _ram_gb() > ram_headroom: + return + gc.collect() + if _ram_gb() > ram_headroom: + return + + clean_list = [] + + for key, (outputs, _), in self.cache.items(): + oom_score = RAM_CACHE_OLD_WORKFLOW_OOM_MULTIPLIER ** (self.generation - self.used_generation[key]) + + ram_usage = RAM_CACHE_DEFAULT_RAM_USAGE + def scan_list_for_ram_usage(outputs): + nonlocal ram_usage + for output in outputs: + if isinstance(output, list): + scan_list_for_ram_usage(output) + elif isinstance(output, torch.Tensor) and output.device.type == 'cpu': + #score Tensors at a 50% discount for RAM usage as they are likely to + #be high value intermediates + ram_usage += (output.numel() * output.element_size()) * 0.5 + elif hasattr(output, "get_ram_usage"): + ram_usage += output.get_ram_usage() + scan_list_for_ram_usage(outputs) + + oom_score *= ram_usage + #In the case where we have no information on the node ram usage at all, + #break OOM score ties on the last touch timestamp (pure LRU) + bisect.insort(clean_list, (oom_score, self.timestamps[key], key)) + + while _ram_gb() < ram_headroom * RAM_CACHE_HYSTERESIS and clean_list: + _, _, key = clean_list.pop() + del self.cache[key] + gc.collect() diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index 341c9735d..0d811e354 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -209,10 +209,15 @@ class ExecutionList(TopologicalSort): self.execution_cache_listeners[from_node_id] = set() self.execution_cache_listeners[from_node_id].add(to_node_id) - def get_output_cache(self, from_node_id, to_node_id): + def get_cache(self, from_node_id, to_node_id): if not to_node_id in self.execution_cache: return None - return self.execution_cache[to_node_id].get(from_node_id) + value = self.execution_cache[to_node_id].get(from_node_id) + if value is None: + return None + #Write back to the main cache on touch. + self.output_cache.set(from_node_id, value) + return value def cache_update(self, node_id, value): if node_id in self.execution_cache_listeners: diff --git a/execution.py b/execution.py index 20e106213..17c77beab 100644 --- a/execution.py +++ b/execution.py @@ -21,6 +21,7 @@ from comfy_execution.caching import ( NullCache, HierarchicalCache, LRUCache, + RAMPressureCache, ) from comfy_execution.graph import ( DynamicPrompt, @@ -88,49 +89,56 @@ class IsChangedCache: return self.is_changed[node_id] +class CacheEntry(NamedTuple): + ui: dict + outputs: list + + class CacheType(Enum): CLASSIC = 0 LRU = 1 NONE = 2 + RAM_PRESSURE = 3 class CacheSet: - def __init__(self, cache_type=None, cache_size=None): + def __init__(self, cache_type=None, cache_args={}): if cache_type == CacheType.NONE: self.init_null_cache() logging.info("Disabling intermediate node cache.") + elif cache_type == CacheType.RAM_PRESSURE: + cache_ram = cache_args.get("ram", 16.0) + self.init_ram_cache(cache_ram) + logging.info("Using RAM pressure cache.") elif cache_type == CacheType.LRU: - if cache_size is None: - cache_size = 0 + cache_size = cache_args.get("lru", 0) self.init_lru_cache(cache_size) logging.info("Using LRU cache") else: self.init_classic_cache() - self.all = [self.outputs, self.ui, self.objects] + self.all = [self.outputs, self.objects] # Performs like the old cache -- dump data ASAP def init_classic_cache(self): self.outputs = HierarchicalCache(CacheKeySetInputSignature) - self.ui = HierarchicalCache(CacheKeySetInputSignature) self.objects = HierarchicalCache(CacheKeySetID) def init_lru_cache(self, cache_size): self.outputs = LRUCache(CacheKeySetInputSignature, max_size=cache_size) - self.ui = LRUCache(CacheKeySetInputSignature, max_size=cache_size) + self.objects = HierarchicalCache(CacheKeySetID) + + def init_ram_cache(self, min_headroom): + self.outputs = RAMPressureCache(CacheKeySetInputSignature) self.objects = HierarchicalCache(CacheKeySetID) def init_null_cache(self): self.outputs = NullCache() - #The UI cache is expected to be iterable at the end of each workflow - #so it must cache at least a full workflow. Use Heirachical - self.ui = HierarchicalCache(CacheKeySetInputSignature) self.objects = NullCache() def recursive_debug_dump(self): result = { "outputs": self.outputs.recursive_debug_dump(), - "ui": self.ui.recursive_debug_dump(), } return result @@ -157,14 +165,14 @@ def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt= if execution_list is None: mark_missing() continue # This might be a lazily-evaluated input - cached_output = execution_list.get_output_cache(input_unique_id, unique_id) - if cached_output is None: + cached = execution_list.get_cache(input_unique_id, unique_id) + if cached is None or cached.outputs is None: mark_missing() continue - if output_index >= len(cached_output): + if output_index >= len(cached.outputs): mark_missing() continue - obj = cached_output[output_index] + obj = cached.outputs[output_index] input_data_all[x] = obj elif input_category is not None: input_data_all[x] = [input_data] @@ -393,7 +401,7 @@ def format_value(x): else: return str(x) -async def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes): +async def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_outputs): unique_id = current_item real_node_id = dynprompt.get_real_node_id(unique_id) display_node_id = dynprompt.get_display_node_id(unique_id) @@ -401,12 +409,15 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, inputs = dynprompt.get_node(unique_id)['inputs'] class_type = dynprompt.get_node(unique_id)['class_type'] class_def = nodes.NODE_CLASS_MAPPINGS[class_type] - if caches.outputs.get(unique_id) is not None: + cached = caches.outputs.get(unique_id) + if cached is not None: if server.client_id is not None: - cached_output = caches.ui.get(unique_id) or {} - server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_output.get("output",None), "prompt_id": prompt_id }, server.client_id) + cached_ui = cached.ui or {} + server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_ui.get("output",None), "prompt_id": prompt_id }, server.client_id) + if cached.ui is not None: + ui_outputs[unique_id] = cached.ui get_progress_state().finish_progress(unique_id) - execution_list.cache_update(unique_id, caches.outputs.get(unique_id)) + execution_list.cache_update(unique_id, cached) return (ExecutionResult.SUCCESS, None, None) input_data_all = None @@ -436,8 +447,8 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, for r in result: if is_link(r): source_node, source_output = r[0], r[1] - node_output = execution_list.get_output_cache(source_node, unique_id)[source_output] - for o in node_output: + node_cached = execution_list.get_cache(source_node, unique_id) + for o in node_cached.outputs[source_output]: resolved_output.append(o) else: @@ -507,7 +518,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, asyncio.create_task(await_completion()) return (ExecutionResult.PENDING, None, None) if len(output_ui) > 0: - caches.ui.set(unique_id, { + ui_outputs[unique_id] = { "meta": { "node_id": unique_id, "display_node": display_node_id, @@ -515,7 +526,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, "real_node_id": real_node_id, }, "output": output_ui - }) + } if server.client_id is not None: server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": output_ui, "prompt_id": prompt_id }, server.client_id) if has_subgraph: @@ -554,8 +565,9 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, pending_subgraph_results[unique_id] = cached_outputs return (ExecutionResult.PENDING, None, None) - caches.outputs.set(unique_id, output_data) - execution_list.cache_update(unique_id, output_data) + cache_entry = CacheEntry(ui=ui_outputs.get(unique_id), outputs=output_data) + execution_list.cache_update(unique_id, cache_entry) + caches.outputs.set(unique_id, cache_entry) except comfy.model_management.InterruptProcessingException as iex: logging.info("Processing interrupted") @@ -600,14 +612,14 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, return (ExecutionResult.SUCCESS, None, None) class PromptExecutor: - def __init__(self, server, cache_type=False, cache_size=None): - self.cache_size = cache_size + def __init__(self, server, cache_type=False, cache_args=None): + self.cache_args = cache_args self.cache_type = cache_type self.server = server self.reset() def reset(self): - self.caches = CacheSet(cache_type=self.cache_type, cache_size=self.cache_size) + self.caches = CacheSet(cache_type=self.cache_type, cache_args=self.cache_args) self.status_messages = [] self.success = True @@ -682,6 +694,7 @@ class PromptExecutor: broadcast=False) pending_subgraph_results = {} pending_async_nodes = {} # TODO - Unify this with pending_subgraph_results + ui_node_outputs = {} executed = set() execution_list = ExecutionList(dynamic_prompt, self.caches.outputs) current_outputs = self.caches.outputs.all_node_ids() @@ -695,7 +708,7 @@ class PromptExecutor: break assert node_id is not None, "Node ID should not be None at this point" - result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes) + result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_node_outputs) self.success = result != ExecutionResult.FAILURE if result == ExecutionResult.FAILURE: self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex) @@ -704,18 +717,16 @@ class PromptExecutor: execution_list.unstage_node_execution() else: # result == ExecutionResult.SUCCESS: execution_list.complete_node_execution() + self.caches.outputs.poll(ram_headroom=self.cache_args["ram"]) else: # Only execute when the while-loop ends without break self.add_message("execution_success", { "prompt_id": prompt_id }, broadcast=False) ui_outputs = {} meta_outputs = {} - all_node_ids = self.caches.ui.all_node_ids() - for node_id in all_node_ids: - ui_info = self.caches.ui.get(node_id) - if ui_info is not None: - ui_outputs[node_id] = ui_info["output"] - meta_outputs[node_id] = ui_info["meta"] + for node_id, ui_info in ui_node_outputs.items(): + ui_outputs[node_id] = ui_info["output"] + meta_outputs[node_id] = ui_info["meta"] self.history_result = { "outputs": ui_outputs, "meta": meta_outputs, diff --git a/main.py b/main.py index 8d466d2eb..e1b0f1620 100644 --- a/main.py +++ b/main.py @@ -172,10 +172,12 @@ def prompt_worker(q, server_instance): cache_type = execution.CacheType.CLASSIC if args.cache_lru > 0: cache_type = execution.CacheType.LRU + elif args.cache_ram > 0: + cache_type = execution.CacheType.RAM_PRESSURE elif args.cache_none: cache_type = execution.CacheType.NONE - e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_size=args.cache_lru) + e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : args.cache_ram } ) last_gc_collect = 0 need_gc = False gc_collect_interval = 10.0 From 614cf9805e1056216487a2d1b1a07206d77f87e7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 30 Oct 2025 19:11:38 -0700 Subject: [PATCH 517/759] Add a ScaleROPE node. Currently only works on WAN models. (#10559) --- comfy/ldm/wan/model.py | 20 ++++++++++++---- comfy/model_patcher.py | 13 +++++++++++ comfy_extras/nodes_rope.py | 47 ++++++++++++++++++++++++++++++++++++++ nodes.py | 1 + 4 files changed, 77 insertions(+), 4 deletions(-) create mode 100644 comfy_extras/nodes_rope.py diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 90c347d3d..77876c2e7 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -588,7 +588,7 @@ class WanModel(torch.nn.Module): x = self.unpatchify(x, grid_sizes) return x - def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None): + def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, transformer_options={}): patch_size = self.patch_size t_len = ((t + (patch_size[0] // 2)) // patch_size[0]) h_len = ((h + (patch_size[1] // 2)) // patch_size[1]) @@ -601,10 +601,22 @@ class WanModel(torch.nn.Module): if steps_w is None: steps_w = w_len + h_start = 0 + w_start = 0 + rope_options = transformer_options.get("rope_options", None) + if rope_options is not None: + t_len = t_len * rope_options.get("scale_t", 1.0) + h_len = h_len * rope_options.get("scale_y", 1.0) + w_len = w_len * rope_options.get("scale_x", 1.0) + + t_start += rope_options.get("shift_t", 0.0) + h_start += rope_options.get("shift_y", 0.0) + w_start += rope_options.get("shift_x", 0.0) + img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype) img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1) - img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1) - img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1) + img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(h_start, h_start + (h_len - 1), steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1) + img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(w_start, w_start + (w_len - 1), steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1) img_ids = img_ids.reshape(1, -1, img_ids.shape[-1]) freqs = self.rope_embedder(img_ids).movedim(1, 2) @@ -630,7 +642,7 @@ class WanModel(torch.nn.Module): if self.ref_conv is not None and "reference_latent" in kwargs: t_len += 1 - freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype) + freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options) return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options, **kwargs)[:, :, :t, :h, :w] def unpatchify(self, x, grid_sizes): diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 674a214ca..3e8799983 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -454,6 +454,19 @@ class ModelPatcher: def set_model_post_input_patch(self, patch): self.set_model_patch(patch, "post_input") + def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs): + rope_options = self.model_options["transformer_options"].get("rope_options", {}) + rope_options["scale_x"] = scale_x + rope_options["scale_y"] = scale_y + rope_options["scale_t"] = scale_t + + rope_options["shift_x"] = shift_x + rope_options["shift_y"] = shift_y + rope_options["shift_t"] = shift_t + + self.model_options["transformer_options"]["rope_options"] = rope_options + + def add_object_patch(self, name, obj): self.object_patches[name] = obj diff --git a/comfy_extras/nodes_rope.py b/comfy_extras/nodes_rope.py new file mode 100644 index 000000000..d1feb031e --- /dev/null +++ b/comfy_extras/nodes_rope.py @@ -0,0 +1,47 @@ +from comfy_api.latest import ComfyExtension, io +from typing_extensions import override + + +class ScaleROPE(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="ScaleROPE", + category="advanced/model_patches", + description="Scale and shift the ROPE of the model.", + is_experimental=True, + inputs=[ + io.Model.Input("model"), + io.Float.Input("scale_x", default=1.0, min=0.0, max=100.0, step=0.1), + io.Float.Input("shift_x", default=0.0, min=-256.0, max=256.0, step=0.1), + + io.Float.Input("scale_y", default=1.0, min=0.0, max=100.0, step=0.1), + io.Float.Input("shift_y", default=0.0, min=-256.0, max=256.0, step=0.1), + + io.Float.Input("scale_t", default=1.0, min=0.0, max=100.0, step=0.1), + io.Float.Input("shift_t", default=0.0, min=-256.0, max=256.0, step=0.1), + + + ], + outputs=[ + io.Model.Output(), + ], + ) + + @classmethod + def execute(cls, model, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t) -> io.NodeOutput: + m = model.clone() + m.set_model_rope_options(scale_x, shift_x, scale_y, shift_y, scale_t, shift_t) + return io.NodeOutput(m) + + +class RopeExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + ScaleROPE + ] + + +async def comfy_entrypoint() -> RopeExtension: + return RopeExtension() diff --git a/nodes.py b/nodes.py index 12e365ca9..5689f6fe1 100644 --- a/nodes.py +++ b/nodes.py @@ -2329,6 +2329,7 @@ async def init_builtin_extra_nodes(): "nodes_model_patch.py", "nodes_easycache.py", "nodes_audio_encoder.py", + "nodes_rope.py", ] import_failed = [] From 27d1bd882925e3bbdffb405cea098ac52bb20ac5 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 30 Oct 2025 19:51:58 -0700 Subject: [PATCH 518/759] Fix rope scaling. (#10560) --- comfy/ldm/wan/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 77876c2e7..5ec1511ce 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -605,9 +605,9 @@ class WanModel(torch.nn.Module): w_start = 0 rope_options = transformer_options.get("rope_options", None) if rope_options is not None: - t_len = t_len * rope_options.get("scale_t", 1.0) - h_len = h_len * rope_options.get("scale_y", 1.0) - w_len = w_len * rope_options.get("scale_x", 1.0) + t_len = (t_len - 1.0) * rope_options.get("scale_t", 1.0) + 1.0 + h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0 + w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0 t_start += rope_options.get("shift_t", 0.0) h_start += rope_options.get("shift_y", 0.0) From 7f374e42c833c69c71605507b90f79cc26d14a71 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 31 Oct 2025 12:41:40 -0700 Subject: [PATCH 519/759] ScaleROPE now works on Lumina models. (#10578) --- comfy/ldm/lumina/model.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index f87d98ac0..b4494a51d 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -522,7 +522,7 @@ class NextDiT(nn.Module): max_cap_len = max(l_effective_cap_len) max_img_len = max(l_effective_img_len) - position_ids = torch.zeros(bsz, max_seq_len, 3, dtype=torch.int32, device=device) + position_ids = torch.zeros(bsz, max_seq_len, 3, dtype=torch.float32, device=device) for i in range(bsz): cap_len = l_effective_cap_len[i] @@ -531,10 +531,22 @@ class NextDiT(nn.Module): H_tokens, W_tokens = H // pH, W // pW assert H_tokens * W_tokens == img_len - position_ids[i, :cap_len, 0] = torch.arange(cap_len, dtype=torch.int32, device=device) + rope_options = transformer_options.get("rope_options", None) + h_scale = 1.0 + w_scale = 1.0 + h_start = 0 + w_start = 0 + if rope_options is not None: + h_scale = rope_options.get("scale_y", 1.0) + w_scale = rope_options.get("scale_x", 1.0) + + h_start = rope_options.get("shift_y", 0.0) + w_start = rope_options.get("shift_x", 0.0) + + position_ids[i, :cap_len, 0] = torch.arange(cap_len, dtype=torch.float32, device=device) position_ids[i, cap_len:cap_len+img_len, 0] = cap_len - row_ids = torch.arange(H_tokens, dtype=torch.int32, device=device).view(-1, 1).repeat(1, W_tokens).flatten() - col_ids = torch.arange(W_tokens, dtype=torch.int32, device=device).view(1, -1).repeat(H_tokens, 1).flatten() + row_ids = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten() + col_ids = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten() position_ids[i, cap_len:cap_len+img_len, 1] = row_ids position_ids[i, cap_len:cap_len+img_len, 2] = col_ids From c58c13b2bad6df0de93cc0cf107e96522a3cb5b3 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 31 Oct 2025 21:25:17 -0700 Subject: [PATCH 520/759] Fix torch compile regression on fp8 ops. (#10580) --- comfy/ops.py | 24 +++++------------ comfy/quant_ops.py | 27 +++++++++++++++---- .../comfy_quant/test_mixed_precision.py | 8 +++--- tests-unit/comfy_quant/test_quant_registry.py | 20 +++++++------- 4 files changed, 43 insertions(+), 36 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 18f6b804b..279f6b1a7 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -401,15 +401,9 @@ def fp8_linear(self, input): if dtype not in [torch.float8_e4m3fn]: return None - tensor_2d = False - if len(input.shape) == 2: - tensor_2d = True - input = input.unsqueeze(1) - - input_shape = input.shape input_dtype = input.dtype - if len(input.shape) == 3: + if input.ndim == 3 or input.ndim == 2: w, bias, offload_stream = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype, offloadable=True) scale_weight = self.scale_weight @@ -422,24 +416,20 @@ def fp8_linear(self, input): if scale_input is None: scale_input = torch.ones((), device=input.device, dtype=torch.float32) input = torch.clamp(input, min=-448, max=448, out=input) - input = input.reshape(-1, input_shape[2]).to(dtype).contiguous() layout_params_weight = {'scale': scale_input, 'orig_dtype': input_dtype} - quantized_input = QuantizedTensor(input.reshape(-1, input_shape[2]).to(dtype).contiguous(), TensorCoreFP8Layout, layout_params_weight) + quantized_input = QuantizedTensor(input.to(dtype).contiguous(), "TensorCoreFP8Layout", layout_params_weight) else: scale_input = scale_input.to(input.device) - quantized_input = QuantizedTensor.from_float(input.reshape(-1, input_shape[2]), TensorCoreFP8Layout, scale=scale_input, dtype=dtype) + quantized_input = QuantizedTensor.from_float(input, "TensorCoreFP8Layout", scale=scale_input, dtype=dtype) # Wrap weight in QuantizedTensor - this enables unified dispatch # Call F.linear - __torch_dispatch__ routes to fp8_linear handler in quant_ops.py! layout_params_weight = {'scale': scale_weight, 'orig_dtype': input_dtype} - quantized_weight = QuantizedTensor(w, TensorCoreFP8Layout, layout_params_weight) + quantized_weight = QuantizedTensor(w, "TensorCoreFP8Layout", layout_params_weight) o = torch.nn.functional.linear(quantized_input, quantized_weight, bias) uncast_bias_weight(self, w, bias, offload_stream) - - if tensor_2d: - return o.reshape(input_shape[0], -1) - return o.reshape((-1, input_shape[1], self.weight.shape[0])) + return o return None @@ -540,12 +530,12 @@ if CUBLAS_IS_AVAILABLE: # ============================================================================== # Mixed Precision Operations # ============================================================================== -from .quant_ops import QuantizedTensor, TensorCoreFP8Layout +from .quant_ops import QuantizedTensor QUANT_FORMAT_MIXINS = { "float8_e4m3fn": { "dtype": torch.float8_e4m3fn, - "layout_type": TensorCoreFP8Layout, + "layout_type": "TensorCoreFP8Layout", "parameters": { "weight_scale": torch.nn.Parameter(torch.zeros((), dtype=torch.float32), requires_grad=False), "input_scale": torch.nn.Parameter(torch.zeros((), dtype=torch.float32), requires_grad=False), diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index c822fe53c..873f173ed 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -123,7 +123,7 @@ class QuantizedTensor(torch.Tensor): layout_type: Layout class (subclass of QuantizedLayout) layout_params: Dict with layout-specific parameters """ - return torch.Tensor._make_subclass(cls, qdata, require_grad=False) + return torch.Tensor._make_wrapper_subclass(cls, qdata.shape, device=qdata.device, dtype=qdata.dtype, requires_grad=False) def __init__(self, qdata, layout_type, layout_params): self._qdata = qdata.contiguous() @@ -183,11 +183,11 @@ class QuantizedTensor(torch.Tensor): @classmethod def from_float(cls, tensor, layout_type, **quantize_kwargs) -> 'QuantizedTensor': - qdata, layout_params = layout_type.quantize(tensor, **quantize_kwargs) + qdata, layout_params = LAYOUTS[layout_type].quantize(tensor, **quantize_kwargs) return cls(qdata, layout_type, layout_params) def dequantize(self) -> torch.Tensor: - return self._layout_type.dequantize(self._qdata, **self._layout_params) + return LAYOUTS[self._layout_type].dequantize(self._qdata, **self._layout_params) @classmethod def __torch_dispatch__(cls, func, types, args=(), kwargs=None): @@ -379,7 +379,12 @@ class TensorCoreFP8Layout(QuantizedLayout): return qtensor._qdata, qtensor._layout_params['scale'] -@register_layout_op(torch.ops.aten.linear.default, TensorCoreFP8Layout) +LAYOUTS = { + "TensorCoreFP8Layout": TensorCoreFP8Layout, +} + + +@register_layout_op(torch.ops.aten.linear.default, "TensorCoreFP8Layout") def fp8_linear(func, args, kwargs): input_tensor = args[0] weight = args[1] @@ -422,7 +427,7 @@ def fp8_linear(func, args, kwargs): 'scale': output_scale, 'orig_dtype': input_tensor._layout_params['orig_dtype'] } - return QuantizedTensor(output, TensorCoreFP8Layout, output_params) + return QuantizedTensor(output, "TensorCoreFP8Layout", output_params) else: return output @@ -436,3 +441,15 @@ def fp8_linear(func, args, kwargs): input_tensor = input_tensor.dequantize() return torch.nn.functional.linear(input_tensor, weight, bias) + + +@register_layout_op(torch.ops.aten.view.default, "TensorCoreFP8Layout") +@register_layout_op(torch.ops.aten.t.default, "TensorCoreFP8Layout") +def fp8_func(func, args, kwargs): + input_tensor = args[0] + if isinstance(input_tensor, QuantizedTensor): + plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor) + ar = list(args) + ar[0] = plain_input + return QuantizedTensor(func(*ar, **kwargs), "TensorCoreFP8Layout", input_tensor._layout_params) + return func(*args, **kwargs) diff --git a/tests-unit/comfy_quant/test_mixed_precision.py b/tests-unit/comfy_quant/test_mixed_precision.py index 267bc177b..f8d1fd04e 100644 --- a/tests-unit/comfy_quant/test_mixed_precision.py +++ b/tests-unit/comfy_quant/test_mixed_precision.py @@ -14,7 +14,7 @@ if not has_gpu(): args.cpu = True from comfy import ops -from comfy.quant_ops import QuantizedTensor, TensorCoreFP8Layout +from comfy.quant_ops import QuantizedTensor class SimpleModel(torch.nn.Module): @@ -104,14 +104,14 @@ class TestMixedPrecisionOps(unittest.TestCase): # Verify weights are wrapped in QuantizedTensor self.assertIsInstance(model.layer1.weight, QuantizedTensor) - self.assertEqual(model.layer1.weight._layout_type, TensorCoreFP8Layout) + self.assertEqual(model.layer1.weight._layout_type, "TensorCoreFP8Layout") # Layer 2 should NOT be quantized self.assertNotIsInstance(model.layer2.weight, QuantizedTensor) # Layer 3 should be quantized self.assertIsInstance(model.layer3.weight, QuantizedTensor) - self.assertEqual(model.layer3.weight._layout_type, TensorCoreFP8Layout) + self.assertEqual(model.layer3.weight._layout_type, "TensorCoreFP8Layout") # Verify scales were loaded self.assertEqual(model.layer1.weight._layout_params['scale'].item(), 2.0) @@ -155,7 +155,7 @@ class TestMixedPrecisionOps(unittest.TestCase): # Verify layer1.weight is a QuantizedTensor with scale preserved self.assertIsInstance(state_dict2["layer1.weight"], QuantizedTensor) self.assertEqual(state_dict2["layer1.weight"]._layout_params['scale'].item(), 3.0) - self.assertEqual(state_dict2["layer1.weight"]._layout_type, TensorCoreFP8Layout) + self.assertEqual(state_dict2["layer1.weight"]._layout_type, "TensorCoreFP8Layout") # Verify non-quantized layers are standard tensors self.assertNotIsInstance(state_dict2["layer2.weight"], QuantizedTensor) diff --git a/tests-unit/comfy_quant/test_quant_registry.py b/tests-unit/comfy_quant/test_quant_registry.py index 477811029..9cb54ede8 100644 --- a/tests-unit/comfy_quant/test_quant_registry.py +++ b/tests-unit/comfy_quant/test_quant_registry.py @@ -25,14 +25,14 @@ class TestQuantizedTensor(unittest.TestCase): scale = torch.tensor(2.0) layout_params = {'scale': scale, 'orig_dtype': torch.bfloat16} - qt = QuantizedTensor(fp8_data, TensorCoreFP8Layout, layout_params) + qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) self.assertIsInstance(qt, QuantizedTensor) self.assertEqual(qt.shape, (256, 128)) self.assertEqual(qt.dtype, torch.float8_e4m3fn) self.assertEqual(qt._layout_params['scale'], scale) self.assertEqual(qt._layout_params['orig_dtype'], torch.bfloat16) - self.assertEqual(qt._layout_type, TensorCoreFP8Layout) + self.assertEqual(qt._layout_type, "TensorCoreFP8Layout") def test_dequantize(self): """Test explicit dequantization""" @@ -41,7 +41,7 @@ class TestQuantizedTensor(unittest.TestCase): scale = torch.tensor(3.0) layout_params = {'scale': scale, 'orig_dtype': torch.float32} - qt = QuantizedTensor(fp8_data, TensorCoreFP8Layout, layout_params) + qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) dequantized = qt.dequantize() self.assertEqual(dequantized.dtype, torch.float32) @@ -54,7 +54,7 @@ class TestQuantizedTensor(unittest.TestCase): qt = QuantizedTensor.from_float( float_tensor, - TensorCoreFP8Layout, + "TensorCoreFP8Layout", scale=scale, dtype=torch.float8_e4m3fn ) @@ -77,28 +77,28 @@ class TestGenericUtilities(unittest.TestCase): fp8_data = torch.randn(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) scale = torch.tensor(1.5) layout_params = {'scale': scale, 'orig_dtype': torch.float32} - qt = QuantizedTensor(fp8_data, TensorCoreFP8Layout, layout_params) + qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) # Detach should return a new QuantizedTensor qt_detached = qt.detach() self.assertIsInstance(qt_detached, QuantizedTensor) self.assertEqual(qt_detached.shape, qt.shape) - self.assertEqual(qt_detached._layout_type, TensorCoreFP8Layout) + self.assertEqual(qt_detached._layout_type, "TensorCoreFP8Layout") def test_clone(self): """Test clone operation on quantized tensor""" fp8_data = torch.randn(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) scale = torch.tensor(1.5) layout_params = {'scale': scale, 'orig_dtype': torch.float32} - qt = QuantizedTensor(fp8_data, TensorCoreFP8Layout, layout_params) + qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) # Clone should return a new QuantizedTensor qt_cloned = qt.clone() self.assertIsInstance(qt_cloned, QuantizedTensor) self.assertEqual(qt_cloned.shape, qt.shape) - self.assertEqual(qt_cloned._layout_type, TensorCoreFP8Layout) + self.assertEqual(qt_cloned._layout_type, "TensorCoreFP8Layout") # Verify it's a deep copy self.assertIsNot(qt_cloned._qdata, qt._qdata) @@ -109,7 +109,7 @@ class TestGenericUtilities(unittest.TestCase): fp8_data = torch.randn(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) scale = torch.tensor(1.5) layout_params = {'scale': scale, 'orig_dtype': torch.float32} - qt = QuantizedTensor(fp8_data, TensorCoreFP8Layout, layout_params) + qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) # Moving to same device should work (CPU to CPU) qt_cpu = qt.to('cpu') @@ -169,7 +169,7 @@ class TestFallbackMechanism(unittest.TestCase): scale = torch.tensor(1.0) a_q = QuantizedTensor.from_float( a_fp32, - TensorCoreFP8Layout, + "TensorCoreFP8Layout", scale=scale, dtype=torch.float8_e4m3fn ) From 5f109fe6a06a3462b31a066bcfd650de67d66102 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 1 Nov 2025 21:13:39 +0200 Subject: [PATCH 521/759] added 12s-20s as available output durations for the LTXV API nodes (#10570) --- comfy_api_nodes/nodes_ltxv.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/comfy_api_nodes/nodes_ltxv.py b/comfy_api_nodes/nodes_ltxv.py index e6ad6e27a..0b757a62b 100644 --- a/comfy_api_nodes/nodes_ltxv.py +++ b/comfy_api_nodes/nodes_ltxv.py @@ -46,7 +46,7 @@ class TextToVideoNode(IO.ComfyNode): multiline=True, default="", ), - IO.Combo.Input("duration", options=[6, 8, 10], default=8), + IO.Combo.Input("duration", options=[6, 8, 10, 12, 14, 16, 18, 20], default=8), IO.Combo.Input( "resolution", options=[ @@ -85,6 +85,10 @@ class TextToVideoNode(IO.ComfyNode): generate_audio: bool = False, ) -> IO.NodeOutput: validate_string(prompt, min_length=1, max_length=10000) + if duration > 10 and (model != "LTX-2 (Fast)" or resolution != "1920x1080" or fps != 25): + raise ValueError( + "Durations over 10s are only available for the Fast model at 1920x1080 resolution and 25 FPS." + ) response = await sync_op_raw( cls, ApiEndpoint("/proxy/ltx/v1/text-to-video", "POST"), @@ -118,7 +122,7 @@ class ImageToVideoNode(IO.ComfyNode): multiline=True, default="", ), - IO.Combo.Input("duration", options=[6, 8, 10], default=8), + IO.Combo.Input("duration", options=[6, 8, 10, 12, 14, 16, 18, 20], default=8), IO.Combo.Input( "resolution", options=[ @@ -158,6 +162,10 @@ class ImageToVideoNode(IO.ComfyNode): generate_audio: bool = False, ) -> IO.NodeOutput: validate_string(prompt, min_length=1, max_length=10000) + if duration > 10 and (model != "LTX-2 (Fast)" or resolution != "1920x1080" or fps != 25): + raise ValueError( + "Durations over 10s are only available for the Fast model at 1920x1080 resolution and 25 FPS." + ) if get_number_of_images(image) != 1: raise ValueError("Currently only one input image is supported.") response = await sync_op_raw( From 20182a393f43ab1fdf798f8da6aac0ef6116e7e6 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 1 Nov 2025 21:14:06 +0200 Subject: [PATCH 522/759] convert StabilityAI to use new API client (#10582) --- comfy_api_nodes/nodes_stability.py | 179 ++++++++--------------------- comfy_api_nodes/util/client.py | 4 +- 2 files changed, 48 insertions(+), 135 deletions(-) diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index 783666ddf..bb7ceed78 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -20,13 +20,6 @@ from comfy_api_nodes.apis.stability_api import ( StabilityAudioInpaintRequest, StabilityAudioResponse, ) -from comfy_api_nodes.apis.client import ( - ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) from comfy_api_nodes.util import ( validate_audio_duration, validate_string, @@ -34,6 +27,9 @@ from comfy_api_nodes.util import ( bytesio_to_image_tensor, tensor_to_bytesio, audio_bytes_to_audio_input, + sync_op, + poll_op, + ApiEndpoint, ) import torch @@ -161,19 +157,11 @@ class StabilityStableImageUltraNode(IO.ComfyNode): "image": image_binary } - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/stable-image/generate/ultra", - method=HttpMethod.POST, - request_model=StabilityStableUltraRequest, - response_model=StabilityStableUltraResponse, - ), - request=StabilityStableUltraRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/stable-image/generate/ultra", method="POST"), + response_model=StabilityStableUltraResponse, + data=StabilityStableUltraRequest( prompt=prompt, negative_prompt=negative_prompt, aspect_ratio=aspect_ratio, @@ -183,9 +171,7 @@ class StabilityStableImageUltraNode(IO.ComfyNode): ), files=files, content_type="multipart/form-data", - auth_kwargs=auth, ) - response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stable Image Ultra generation failed: {response_api.finish_reason}.") @@ -313,19 +299,11 @@ class StabilityStableImageSD_3_5Node(IO.ComfyNode): "image": image_binary } - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/stable-image/generate/sd3", - method=HttpMethod.POST, - request_model=StabilityStable3_5Request, - response_model=StabilityStableUltraResponse, - ), - request=StabilityStable3_5Request( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/stable-image/generate/sd3", method="POST"), + response_model=StabilityStableUltraResponse, + data=StabilityStable3_5Request( prompt=prompt, negative_prompt=negative_prompt, aspect_ratio=aspect_ratio, @@ -338,9 +316,7 @@ class StabilityStableImageSD_3_5Node(IO.ComfyNode): ), files=files, content_type="multipart/form-data", - auth_kwargs=auth, ) - response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stable Diffusion 3.5 Image generation failed: {response_api.finish_reason}.") @@ -427,19 +403,11 @@ class StabilityUpscaleConservativeNode(IO.ComfyNode): "image": image_binary } - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/stable-image/upscale/conservative", - method=HttpMethod.POST, - request_model=StabilityUpscaleConservativeRequest, - response_model=StabilityStableUltraResponse, - ), - request=StabilityUpscaleConservativeRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/stable-image/upscale/conservative", method="POST"), + response_model=StabilityStableUltraResponse, + data=StabilityUpscaleConservativeRequest( prompt=prompt, negative_prompt=negative_prompt, creativity=round(creativity,2), @@ -447,9 +415,7 @@ class StabilityUpscaleConservativeNode(IO.ComfyNode): ), files=files, content_type="multipart/form-data", - auth_kwargs=auth, ) - response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stability Upscale Conservative generation failed: {response_api.finish_reason}.") @@ -544,19 +510,11 @@ class StabilityUpscaleCreativeNode(IO.ComfyNode): "image": image_binary } - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/stable-image/upscale/creative", - method=HttpMethod.POST, - request_model=StabilityUpscaleCreativeRequest, - response_model=StabilityAsyncResponse, - ), - request=StabilityUpscaleCreativeRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/stable-image/upscale/creative", method="POST"), + response_model=StabilityAsyncResponse, + data=StabilityUpscaleCreativeRequest( prompt=prompt, negative_prompt=negative_prompt, creativity=round(creativity,2), @@ -565,25 +523,15 @@ class StabilityUpscaleCreativeNode(IO.ComfyNode): ), files=files, content_type="multipart/form-data", - auth_kwargs=auth, ) - response_api = await operation.execute() - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/stability/v2beta/results/{response_api.id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=StabilityResultsGetResponse, - ), + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/stability/v2beta/results/{response_api.id}"), + response_model=StabilityResultsGetResponse, poll_interval=3, - completed_statuses=[StabilityPollStatus.finished], - failed_statuses=[StabilityPollStatus.failed], status_extractor=lambda x: get_async_dummy_status(x), - auth_kwargs=auth, - node_id=cls.hidden.unique_id, ) - response_poll: StabilityResultsGetResponse = await operation.execute() if response_poll.finish_reason != "SUCCESS": raise Exception(f"Stability Upscale Creative generation failed: {response_poll.finish_reason}.") @@ -628,24 +576,13 @@ class StabilityUpscaleFastNode(IO.ComfyNode): "image": image_binary } - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/stable-image/upscale/fast", - method=HttpMethod.POST, - request_model=EmptyRequest, - response_model=StabilityStableUltraResponse, - ), - request=EmptyRequest(), + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/stable-image/upscale/fast", method="POST"), + response_model=StabilityStableUltraResponse, files=files, content_type="multipart/form-data", - auth_kwargs=auth, ) - response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stability Upscale Fast failed: {response_api.finish_reason}.") @@ -717,21 +654,13 @@ class StabilityTextToAudio(IO.ComfyNode): async def execute(cls, model: str, prompt: str, duration: int, seed: int, steps: int) -> IO.NodeOutput: validate_string(prompt, max_length=10000) payload = StabilityTextToAudioRequest(prompt=prompt, model=model, duration=duration, seed=seed, steps=steps) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/audio/stable-audio-2/text-to-audio", - method=HttpMethod.POST, - request_model=StabilityTextToAudioRequest, - response_model=StabilityAudioResponse, - ), - request=payload, + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/audio/stable-audio-2/text-to-audio", method="POST"), + response_model=StabilityAudioResponse, + data=payload, content_type="multipart/form-data", - auth_kwargs= { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, ) - response_api = await operation.execute() if not response_api.audio: raise ValueError("No audio file was received in response.") return IO.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) @@ -814,22 +743,14 @@ class StabilityAudioToAudio(IO.ComfyNode): payload = StabilityAudioToAudioRequest( prompt=prompt, model=model, duration=duration, seed=seed, steps=steps, strength=strength ) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/audio/stable-audio-2/audio-to-audio", - method=HttpMethod.POST, - request_model=StabilityAudioToAudioRequest, - response_model=StabilityAudioResponse, - ), - request=payload, + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/audio/stable-audio-2/audio-to-audio", method="POST"), + response_model=StabilityAudioResponse, + data=payload, content_type="multipart/form-data", files={"audio": audio_input_to_mp3(audio)}, - auth_kwargs= { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, ) - response_api = await operation.execute() if not response_api.audio: raise ValueError("No audio file was received in response.") return IO.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) @@ -935,22 +856,14 @@ class StabilityAudioInpaint(IO.ComfyNode): mask_start=mask_start, mask_end=mask_end, ) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/audio/stable-audio-2/inpaint", - method=HttpMethod.POST, - request_model=StabilityAudioInpaintRequest, - response_model=StabilityAudioResponse, - ), - request=payload, + response_api = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/stability/v2beta/audio/stable-audio-2/inpaint", method="POST"), + response_model=StabilityAudioResponse, + data=payload, content_type="multipart/form-data", files={"audio": audio_input_to_mp3(audio)}, - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, ) - response_api = await operation.execute() if not response_api.audio: raise ValueError("No audio file was received in response.") return IO.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py index 9ae512fe5..65bb35f0f 100644 --- a/comfy_api_nodes/util/client.py +++ b/comfy_api_nodes/util/client.py @@ -77,7 +77,7 @@ class _PollUIState: _RETRY_STATUS = {408, 429, 500, 502, 503, 504} -COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed"] +COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished"] FAILED_STATUSES = ["cancelled", "canceled", "fail", "failed", "error"] QUEUED_STATUSES = ["created", "queued", "queueing", "submitted"] @@ -589,7 +589,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): operation_id = _generate_operation_id(method, cfg.endpoint.path, attempt) logging.debug("[DEBUG] HTTP %s %s (attempt %d)", method, url, attempt) - payload_headers = {"Accept": "*/*"} + payload_headers = {"Accept": "*/*"} if expect_binary else {"Accept": "application/json"} if not parsed_url.scheme and not parsed_url.netloc: # is URL relative? payload_headers.update(get_auth_header(cfg.node_cls)) if cfg.endpoint.headers: From 44869ff786dc90b36172fd766c9a110e4c40c04b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 1 Nov 2025 14:25:59 -0700 Subject: [PATCH 523/759] Fix issue with pinned memory. (#10597) --- comfy/model_patcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 3e8799983..5a31a8734 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -298,6 +298,7 @@ class ModelPatcher: n.backup = self.backup n.object_patches_backup = self.object_patches_backup n.parent = self + n.pinned = self.pinned n.force_cast_weights = self.force_cast_weights From 135fa49ec23320834f774cf3def9e51ad3773f86 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Sun, 2 Nov 2025 08:48:53 +1000 Subject: [PATCH 524/759] Small speed improvements to --async-offload (#10593) * ops: dont take an offload stream if you dont need one * ops: prioritize mem transfer The async offload streams reason for existence is to transfer from RAM to GPU. The post processing compute steps are a bonus on the side stream, but if the compute stream is running a long kernel, it can stall the side stream, as it wait to type-cast the bias before transferring the weight. So do a pure xfer of the weight straight up, then do everything bias, then go back to fix the weight type and do weight patches. --- comfy/ops.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 279f6b1a7..0c8f23848 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -84,7 +84,8 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of if device is None: device = input.device - if offloadable: + if offloadable and (device != s.weight.device or + (s.bias is not None and device != s.bias.device)): offload_stream = comfy.model_management.get_offload_stream(device) else: offload_stream = None @@ -94,20 +95,24 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of else: wf_context = contextlib.nullcontext() - bias = None non_blocking = comfy.model_management.device_supports_non_blocking(device) - if s.bias is not None: - has_function = len(s.bias_function) > 0 - bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=has_function, stream=offload_stream) - if has_function: + weight_has_function = len(s.weight_function) > 0 + bias_has_function = len(s.bias_function) > 0 + + weight = comfy.model_management.cast_to(s.weight, None, device, non_blocking=non_blocking, copy=weight_has_function, stream=offload_stream) + + bias = None + if s.bias is not None: + bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=bias_has_function, stream=offload_stream) + + if bias_has_function: with wf_context: for f in s.bias_function: bias = f(bias) - has_function = len(s.weight_function) > 0 - weight = comfy.model_management.cast_to(s.weight, dtype, device, non_blocking=non_blocking, copy=has_function, stream=offload_stream) - if has_function: + weight = weight.to(dtype=dtype) + if weight_has_function: with wf_context: for f in s.weight_function: weight = f(weight) From 97ff9fae7e728cffdfc3aee6d72aa1e0d0b78702 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 2 Nov 2025 10:14:04 -0800 Subject: [PATCH 525/759] Clarify help text for --fast argument (#10609) Updated help text for the --fast argument to clarify potential risks. --- comfy/cli_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 3d5bc7c90..3947e62a8 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -147,7 +147,7 @@ class PerformanceFeature(enum.Enum): AutoTune = "autotune" PinnedMem = "pinned_memory" -parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature)))) +parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. This is used to test new features so using it might crash your comfyui. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature)))) parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.") parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.") From 6d6a18b0b730351416556eeb0990ab219ffec189 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 3 Nov 2025 10:04:56 +0200 Subject: [PATCH 526/759] fix(api-nodes-cloud): stop using sub-folder and absolute path for output of Rodin3D nodes (#10556) --- comfy_api_nodes/nodes_rodin.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index cf2172bd6..ad4029236 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -225,21 +225,20 @@ async def get_rodin_download_list(uuid, auth_kwargs: Optional[dict[str, str]] = async def download_files(url_list, task_uuid): - save_path = os.path.join(comfy_paths.get_output_directory(), f"Rodin3D_{task_uuid}") + result_folder_name = f"Rodin3D_{task_uuid}" + save_path = os.path.join(comfy_paths.get_output_directory(), result_folder_name) os.makedirs(save_path, exist_ok=True) model_file_path = None async with aiohttp.ClientSession() as session: for i in url_list.list: - url = i.url - file_name = i.name - file_path = os.path.join(save_path, file_name) + file_path = os.path.join(save_path, i.name) if file_path.endswith(".glb"): - model_file_path = file_path + model_file_path = os.path.join(result_folder_name, i.name) logging.info("[ Rodin3D API - download_files ] Downloading file: %s", file_path) max_retries = 5 for attempt in range(max_retries): try: - async with session.get(url) as resp: + async with session.get(i.url) as resp: resp.raise_for_status() with open(file_path, "wb") as f: async for chunk in resp.content.iter_chunked(32 * 1024): From 88df172790b8ed7b2e6ea7c0f0bd63ca3553921b Mon Sep 17 00:00:00 2001 From: EverNebula Date: Mon, 3 Nov 2025 16:16:40 +0800 Subject: [PATCH 527/759] fix(caching): treat bytes as hashable (#10567) --- comfy_execution/caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py index b498f43e7..e077f78b0 100644 --- a/comfy_execution/caching.py +++ b/comfy_execution/caching.py @@ -53,7 +53,7 @@ class Unhashable: def to_hashable(obj): # So that we don't infinitely recurse since frozenset and tuples # are Sequences. - if isinstance(obj, (int, float, str, bool, type(None))): + if isinstance(obj, (int, float, str, bool, bytes, type(None))): return obj elif isinstance(obj, Mapping): return frozenset([(to_hashable(k), to_hashable(v)) for k, v in sorted(obj.items())]) From 1f3f7a2823017ad193e060d9221fb6a52f2eba3a Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 3 Nov 2025 10:21:47 +0200 Subject: [PATCH 528/759] convert nodes_hypernetwork.py to V3 schema (#10583) --- comfy_extras/nodes_hypernetwork.py | 48 ++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/comfy_extras/nodes_hypernetwork.py b/comfy_extras/nodes_hypernetwork.py index 665632292..2a6a87a81 100644 --- a/comfy_extras/nodes_hypernetwork.py +++ b/comfy_extras/nodes_hypernetwork.py @@ -2,6 +2,9 @@ import comfy.utils import folder_paths import torch import logging +from comfy_api.latest import IO, ComfyExtension +from typing_extensions import override + def load_hypernetwork_patch(path, strength): sd = comfy.utils.load_torch_file(path, safe_load=True) @@ -94,27 +97,42 @@ def load_hypernetwork_patch(path, strength): return hypernetwork_patch(out, strength) -class HypernetworkLoader: +class HypernetworkLoader(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "hypernetwork_name": (folder_paths.get_filename_list("hypernetworks"), ), - "strength": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "load_hypernetwork" + def define_schema(cls): + return IO.Schema( + node_id="HypernetworkLoader", + category="loaders", + inputs=[ + IO.Model.Input("model"), + IO.Combo.Input("hypernetwork_name", options=folder_paths.get_filename_list("hypernetworks")), + IO.Float.Input("strength", default=1.0, min=-10.0, max=10.0, step=0.01), + ], + outputs=[ + IO.Model.Output(), + ], + ) - CATEGORY = "loaders" - - def load_hypernetwork(self, model, hypernetwork_name, strength): + @classmethod + def execute(cls, model, hypernetwork_name, strength) -> IO.NodeOutput: hypernetwork_path = folder_paths.get_full_path_or_raise("hypernetworks", hypernetwork_name) model_hypernetwork = model.clone() patch = load_hypernetwork_patch(hypernetwork_path, strength) if patch is not None: model_hypernetwork.set_model_attn1_patch(patch) model_hypernetwork.set_model_attn2_patch(patch) - return (model_hypernetwork,) + return IO.NodeOutput(model_hypernetwork) -NODE_CLASS_MAPPINGS = { - "HypernetworkLoader": HypernetworkLoader -} + load_hypernetwork = execute # TODO: remove + + +class HyperNetworkExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + HypernetworkLoader, + ] + + +async def comfy_entrypoint() -> HyperNetworkExtension: + return HyperNetworkExtension() From e617cddf244e4b789afba4b4ece01661b12fdde5 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 3 Nov 2025 10:28:13 +0200 Subject: [PATCH 529/759] convert nodes_openai.py to V3 schema (#10604) --- comfy_api_nodes/apinode_utils.py | 73 -- comfy_api_nodes/nodes_openai.py | 1094 +++++++++++---------------- comfy_api_nodes/util/__init__.py | 4 + comfy_api_nodes/util/conversions.py | 19 +- 4 files changed, 482 insertions(+), 708 deletions(-) delete mode 100644 comfy_api_nodes/apinode_utils.py diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py deleted file mode 100644 index ecd604ff8..000000000 --- a/comfy_api_nodes/apinode_utils.py +++ /dev/null @@ -1,73 +0,0 @@ -from __future__ import annotations -import aiohttp -import mimetypes -from typing import Union -from server import PromptServer - -import numpy as np -from PIL import Image -import torch -import base64 -from io import BytesIO - - -async def validate_and_cast_response( - response, timeout: int = None, node_id: Union[str, None] = None -) -> torch.Tensor: - """Validates and casts a response to a torch.Tensor. - - Args: - response: The response to validate and cast. - timeout: Request timeout in seconds. Defaults to None (no timeout). - - Returns: - A torch.Tensor representing the image (1, H, W, C). - - Raises: - ValueError: If the response is not valid. - """ - # validate raw JSON response - data = response.data - if not data or len(data) == 0: - raise ValueError("No images returned from API endpoint") - - # Initialize list to store image tensors - image_tensors: list[torch.Tensor] = [] - - # Process each image in the data array - async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=timeout)) as session: - for img_data in data: - img_bytes: bytes - if img_data.b64_json: - img_bytes = base64.b64decode(img_data.b64_json) - elif img_data.url: - if node_id: - PromptServer.instance.send_progress_text(f"Result URL: {img_data.url}", node_id) - async with session.get(img_data.url) as resp: - if resp.status != 200: - raise ValueError("Failed to download generated image") - img_bytes = await resp.read() - else: - raise ValueError("Invalid image payload – neither URL nor base64 data present.") - - pil_img = Image.open(BytesIO(img_bytes)).convert("RGBA") - arr = np.asarray(pil_img).astype(np.float32) / 255.0 - image_tensors.append(torch.from_numpy(arr)) - - return torch.stack(image_tensors, dim=0) - - -def text_filepath_to_base64_string(filepath: str) -> str: - """Converts a text file to a base64 string.""" - with open(filepath, "rb") as f: - file_content = f.read() - return base64.b64encode(file_content).decode("utf-8") - - -def text_filepath_to_data_uri(filepath: str) -> str: - """Converts a text file to a data URI.""" - base64_string = text_filepath_to_base64_string(filepath) - mime_type, _ = mimetypes.guess_type(filepath) - if mime_type is None: - mime_type = "application/octet-stream" - return f"data:{mime_type};base64,{base64_string}" diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index b4568fc85..acf35d276 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -1,18 +1,19 @@ -import io -from typing import TypedDict, Optional +from io import BytesIO +from typing import Optional, Union import json import os import time -import re import uuid from enum import Enum from inspect import cleandoc import numpy as np import torch from PIL import Image -from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeDict from server import PromptServer import folder_paths +import base64 +from comfy_api.latest import IO, ComfyExtension +from typing_extensions import override from comfy_api_nodes.apis import ( @@ -23,7 +24,6 @@ from comfy_api_nodes.apis import ( OpenAIResponse, CreateModelResponseProperties, Item, - Includable, OutputContent, InputImageContent, Detail, @@ -34,41 +34,22 @@ from comfy_api_nodes.apis import ( InputFileContent, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( + downscale_image_tensor, + download_url_to_bytesio, + validate_string, + tensor_to_base64_string, ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) - -from comfy_api_nodes.apinode_utils import ( - validate_and_cast_response, + sync_op, + poll_op, text_filepath_to_data_uri, ) -from comfy_api_nodes.mapper_utils import model_field_to_node_input -from comfy_api_nodes.util import downscale_image_tensor, validate_string, tensor_to_base64_string RESPONSES_ENDPOINT = "/proxy/openai/v1/responses" STARTING_POINT_ID_PATTERN = r"" -class HistoryEntry(TypedDict): - """Type definition for a single history entry in the chat.""" - - prompt: str - response: str - response_id: str - timestamp: float - - -class ChatHistory(TypedDict): - """Type definition for the chat history dictionary.""" - - __annotations__: dict[str, list[HistoryEntry]] - - class SupportedOpenAIModel(str, Enum): o4_mini = "o4-mini" o1 = "o1" @@ -83,98 +64,123 @@ class SupportedOpenAIModel(str, Enum): gpt_5_nano = "gpt-5-nano" -class OpenAIDalle2(ComfyNodeABC): +async def validate_and_cast_response(response, timeout: int = None) -> torch.Tensor: + """Validates and casts a response to a torch.Tensor. + + Args: + response: The response to validate and cast. + timeout: Request timeout in seconds. Defaults to None (no timeout). + + Returns: + A torch.Tensor representing the image (1, H, W, C). + + Raises: + ValueError: If the response is not valid. + """ + # validate raw JSON response + data = response.data + if not data or len(data) == 0: + raise ValueError("No images returned from API endpoint") + + # Initialize list to store image tensors + image_tensors: list[torch.Tensor] = [] + + # Process each image in the data array + for img_data in data: + if img_data.b64_json: + img_io = BytesIO(base64.b64decode(img_data.b64_json)) + elif img_data.url: + img_io = BytesIO() + await download_url_to_bytesio(img_data.url, img_io, timeout=timeout) + else: + raise ValueError("Invalid image payload – neither URL nor base64 data present.") + + pil_img = Image.open(img_io).convert("RGBA") + arr = np.asarray(pil_img).astype(np.float32) / 255.0 + image_tensors.append(torch.from_numpy(arr)) + + return torch.stack(image_tensors, dim=0) + + +class OpenAIDalle2(IO.ComfyNode): """ Generates images synchronously via OpenAI's DALL·E 2 endpoint. """ - def __init__(self): - pass + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="OpenAIDalle2", + display_name="OpenAI DALL·E 2", + category="api node/image/OpenAI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.String.Input( + "prompt", + default="", + multiline=True, + tooltip="Text prompt for DALL·E", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2**31 - 1, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="not implemented yet in backend", + optional=True, + ), + IO.Combo.Input( + "size", + default="1024x1024", + options=["256x256", "512x512", "1024x1024"], + tooltip="Image size", + optional=True, + ), + IO.Int.Input( + "n", + default=1, + min=1, + max=8, + step=1, + tooltip="How many images to generate", + display_mode=IO.NumberDisplay.number, + optional=True, + ), + IO.Image.Input( + "image", + tooltip="Optional reference image for image editing.", + optional=True, + ), + IO.Mask.Input( + "mask", + tooltip="Optional mask for inpainting (white areas will be replaced)", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text prompt for DALL·E", - }, - ), - }, - "optional": { - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2**31 - 1, - "step": 1, - "display": "number", - "control_after_generate": True, - "tooltip": "not implemented yet in backend", - }, - ), - "size": ( - IO.COMBO, - { - "options": ["256x256", "512x512", "1024x1024"], - "default": "1024x1024", - "tooltip": "Image size", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 8, - "step": 1, - "display": "number", - "tooltip": "How many images to generate", - }, - ), - "image": ( - IO.IMAGE, - { - "default": None, - "tooltip": "Optional reference image for image editing.", - }, - ), - "mask": ( - IO.MASK, - { - "default": None, - "tooltip": "Optional mask for inpainting (white areas will be replaced)", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = (IO.IMAGE,) - FUNCTION = "api_call" - CATEGORY = "api node/image/OpenAI" - DESCRIPTION = cleandoc(__doc__ or "") - API_NODE = True - - async def api_call( - self, + async def execute( + cls, prompt, seed=0, image=None, mask=None, n=1, size="1024x1024", - unique_id=None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) model = "dall-e-2" path = "/proxy/openai/images/generations" @@ -200,7 +206,7 @@ class OpenAIDalle2(ComfyNodeABC): image_np = (rgba_tensor.numpy() * 255).astype(np.uint8) img = Image.fromarray(image_np) - img_byte_arr = io.BytesIO() + img_byte_arr = BytesIO() img.save(img_byte_arr, format="PNG") img_byte_arr.seek(0) img_binary = img_byte_arr # .getvalue() @@ -208,15 +214,11 @@ class OpenAIDalle2(ComfyNodeABC): elif image is not None or mask is not None: raise Exception("Dall-E 2 image editing requires an image AND a mask") - # Build the operation - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=request_class, - response_model=OpenAIImageGenerationResponse, - ), - request=request_class( + response = await sync_op( + cls, + ApiEndpoint(path=path, method="POST"), + response_model=OpenAIImageGenerationResponse, + data=request_class( model=model, prompt=prompt, n=n, @@ -231,109 +233,92 @@ class OpenAIDalle2(ComfyNodeABC): else None ), content_type=content_type, - auth_kwargs=kwargs, ) - response = await operation.execute() - - img_tensor = await validate_and_cast_response(response, node_id=unique_id) - return (img_tensor,) + return IO.NodeOutput(await validate_and_cast_response(response)) -class OpenAIDalle3(ComfyNodeABC): +class OpenAIDalle3(IO.ComfyNode): """ Generates images synchronously via OpenAI's DALL·E 3 endpoint. """ - def __init__(self): - pass + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="OpenAIDalle3", + display_name="OpenAI DALL·E 3", + category="api node/image/OpenAI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.String.Input( + "prompt", + default="", + multiline=True, + tooltip="Text prompt for DALL·E", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2 ** 31 - 1, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="not implemented yet in backend", + optional=True, + ), + IO.Combo.Input( + "quality", + default="standard", + options=["standard", "hd"], + tooltip="Image quality", + optional=True, + ), + IO.Combo.Input( + "style", + default="natural", + options=["natural", "vivid"], + tooltip="Vivid causes the model to lean towards generating hyper-real and dramatic images. Natural causes the model to produce more natural, less hyper-real looking images.", + optional=True, + ), + IO.Combo.Input( + "size", + default="1024x1024", + options=["1024x1024", "1024x1792", "1792x1024"], + tooltip="Image size", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text prompt for DALL·E", - }, - ), - }, - "optional": { - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2**31 - 1, - "step": 1, - "display": "number", - "control_after_generate": True, - "tooltip": "not implemented yet in backend", - }, - ), - "quality": ( - IO.COMBO, - { - "options": ["standard", "hd"], - "default": "standard", - "tooltip": "Image quality", - }, - ), - "style": ( - IO.COMBO, - { - "options": ["natural", "vivid"], - "default": "natural", - "tooltip": "Vivid causes the model to lean towards generating hyper-real and dramatic images. Natural causes the model to produce more natural, less hyper-real looking images.", - }, - ), - "size": ( - IO.COMBO, - { - "options": ["1024x1024", "1024x1792", "1792x1024"], - "default": "1024x1024", - "tooltip": "Image size", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = (IO.IMAGE,) - FUNCTION = "api_call" - CATEGORY = "api node/image/OpenAI" - DESCRIPTION = cleandoc(__doc__ or "") - API_NODE = True - - async def api_call( - self, + async def execute( + cls, prompt, seed=0, style="natural", quality="standard", size="1024x1024", - unique_id=None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) model = "dall-e-3" # build the operation - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/openai/images/generations", - method=HttpMethod.POST, - request_model=OpenAIImageGenerationRequest, - response_model=OpenAIImageGenerationResponse, - ), - request=OpenAIImageGenerationRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/openai/images/generations", method="POST"), + response_model=OpenAIImageGenerationResponse, + data=OpenAIImageGenerationRequest( model=model, prompt=prompt, quality=quality, @@ -341,114 +326,97 @@ class OpenAIDalle3(ComfyNodeABC): style=style, seed=seed, ), - auth_kwargs=kwargs, ) - response = await operation.execute() - - img_tensor = await validate_and_cast_response(response, node_id=unique_id) - return (img_tensor,) + return IO.NodeOutput(await validate_and_cast_response(response)) -class OpenAIGPTImage1(ComfyNodeABC): +class OpenAIGPTImage1(IO.ComfyNode): """ Generates images synchronously via OpenAI's GPT Image 1 endpoint. """ - def __init__(self): - pass + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="OpenAIGPTImage1", + display_name="OpenAI GPT Image 1", + category="api node/image/OpenAI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.String.Input( + "prompt", + default="", + multiline=True, + tooltip="Text prompt for GPT Image 1", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2 ** 31 - 1, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="not implemented yet in backend", + optional=True, + ), + IO.Combo.Input( + "quality", + default="low", + options=["low", "medium", "high"], + tooltip="Image quality, affects cost and generation time.", + optional=True, + ), + IO.Combo.Input( + "background", + default="opaque", + options=["opaque", "transparent"], + tooltip="Return image with or without background", + optional=True, + ), + IO.Combo.Input( + "size", + default="auto", + options=["auto", "1024x1024", "1024x1536", "1536x1024"], + tooltip="Image size", + optional=True, + ), + IO.Int.Input( + "n", + default=1, + min=1, + max=8, + step=1, + tooltip="How many images to generate", + display_mode=IO.NumberDisplay.number, + optional=True, + ), + IO.Image.Input( + "image", + tooltip="Optional reference image for image editing.", + optional=True, + ), + IO.Mask.Input( + "mask", + tooltip="Optional mask for inpainting (white areas will be replaced)", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text prompt for GPT Image 1", - }, - ), - }, - "optional": { - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2**31 - 1, - "step": 1, - "display": "number", - "control_after_generate": True, - "tooltip": "not implemented yet in backend", - }, - ), - "quality": ( - IO.COMBO, - { - "options": ["low", "medium", "high"], - "default": "low", - "tooltip": "Image quality, affects cost and generation time.", - }, - ), - "background": ( - IO.COMBO, - { - "options": ["opaque", "transparent"], - "default": "opaque", - "tooltip": "Return image with or without background", - }, - ), - "size": ( - IO.COMBO, - { - "options": ["auto", "1024x1024", "1024x1536", "1536x1024"], - "default": "auto", - "tooltip": "Image size", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 8, - "step": 1, - "display": "number", - "tooltip": "How many images to generate", - }, - ), - "image": ( - IO.IMAGE, - { - "default": None, - "tooltip": "Optional reference image for image editing.", - }, - ), - "mask": ( - IO.MASK, - { - "default": None, - "tooltip": "Optional mask for inpainting (white areas will be replaced)", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = (IO.IMAGE,) - FUNCTION = "api_call" - CATEGORY = "api node/image/OpenAI" - DESCRIPTION = cleandoc(__doc__ or "") - API_NODE = True - - async def api_call( - self, + async def execute( + cls, prompt, seed=0, quality="low", @@ -457,9 +425,7 @@ class OpenAIGPTImage1(ComfyNodeABC): mask=None, n=1, size="1024x1024", - unique_id=None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) model = "gpt-image-1" path = "/proxy/openai/images/generations" @@ -480,7 +446,7 @@ class OpenAIGPTImage1(ComfyNodeABC): image_np = (scaled_image.numpy() * 255).astype(np.uint8) img = Image.fromarray(image_np) - img_byte_arr = io.BytesIO() + img_byte_arr = BytesIO() img.save(img_byte_arr, format="PNG") img_byte_arr.seek(0) @@ -504,20 +470,17 @@ class OpenAIGPTImage1(ComfyNodeABC): mask_np = (scaled_mask.numpy() * 255).astype(np.uint8) mask_img = Image.fromarray(mask_np) - mask_img_byte_arr = io.BytesIO() + mask_img_byte_arr = BytesIO() mask_img.save(mask_img_byte_arr, format="PNG") mask_img_byte_arr.seek(0) files.append(("mask", ("mask.png", mask_img_byte_arr, "image/png"))) # Build the operation - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=request_class, - response_model=OpenAIImageGenerationResponse, - ), - request=request_class( + response = await sync_op( + cls, + ApiEndpoint(path=path, method="POST"), + response_model=OpenAIImageGenerationResponse, + data=request_class( model=model, prompt=prompt, quality=quality, @@ -528,127 +491,70 @@ class OpenAIGPTImage1(ComfyNodeABC): ), files=files if files else None, content_type=content_type, - auth_kwargs=kwargs, ) - response = await operation.execute() - - img_tensor = await validate_and_cast_response(response, node_id=unique_id) - return (img_tensor,) + return IO.NodeOutput(await validate_and_cast_response(response)) -class OpenAITextNode(ComfyNodeABC): - """ - Base class for OpenAI text generation nodes. - """ - - RETURN_TYPES = (IO.STRING,) - FUNCTION = "api_call" - CATEGORY = "api node/text/OpenAI" - API_NODE = True - - -class OpenAIChatNode(OpenAITextNode): +class OpenAIChatNode(IO.ComfyNode): """ Node to generate text responses from an OpenAI model. """ - def __init__(self) -> None: - """Initialize the chat node with a new session ID and empty history.""" - self.current_session_id: str = str(uuid.uuid4()) - self.history: dict[str, list[HistoryEntry]] = {} - self.previous_response_id: Optional[str] = None + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="OpenAIChatNode", + display_name="OpenAI ChatGPT", + category="api node/text/OpenAI", + description="Generate text responses from an OpenAI model.", + inputs=[ + IO.String.Input( + "prompt", + default="", + multiline=True, + tooltip="Text inputs to the model, used to generate a response.", + ), + IO.Boolean.Input( + "persist_context", + default=False, + tooltip="This parameter is deprecated and has no effect.", + ), + IO.Combo.Input( + "model", + options=SupportedOpenAIModel, + tooltip="The model used to generate the response", + ), + IO.Image.Input( + "images", + tooltip="Optional image(s) to use as context for the model. To include multiple images, you can use the Batch Images node.", + optional=True, + ), + IO.Custom("OPENAI_INPUT_FILES").Input( + "files", + optional=True, + tooltip="Optional file(s) to use as context for the model. Accepts inputs from the OpenAI Chat Input Files node.", + ), + IO.Custom("OPENAI_CHAT_CONFIG").Input( + "advanced_options", + optional=True, + tooltip="Optional configuration for the model. Accepts inputs from the OpenAI Chat Advanced Options node.", + ), + ], + outputs=[ + IO.String.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text inputs to the model, used to generate a response.", - }, - ), - "persist_context": ( - IO.BOOLEAN, - { - "default": True, - "tooltip": "Persist chat context between calls (multi-turn conversation)", - }, - ), - "model": model_field_to_node_input( - IO.COMBO, - OpenAICreateResponse, - "model", - enum_type=SupportedOpenAIModel, - ), - }, - "optional": { - "images": ( - IO.IMAGE, - { - "default": None, - "tooltip": "Optional image(s) to use as context for the model. To include multiple images, you can use the Batch Images node.", - }, - ), - "files": ( - "OPENAI_INPUT_FILES", - { - "default": None, - "tooltip": "Optional file(s) to use as context for the model. Accepts inputs from the OpenAI Chat Input Files node.", - }, - ), - "advanced_options": ( - "OPENAI_CHAT_CONFIG", - { - "default": None, - "tooltip": "Optional configuration for the model. Accepts inputs from the OpenAI Chat Advanced Options node.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - DESCRIPTION = "Generate text responses from an OpenAI model." - - async def get_result_response( - self, - response_id: str, - include: Optional[list[Includable]] = None, - auth_kwargs: Optional[dict[str, str]] = None, - ) -> OpenAIResponse: - """ - Retrieve a model response with the given ID from the OpenAI API. - - Args: - response_id (str): The ID of the response to retrieve. - include (Optional[List[Includable]]): Additional fields to include - in the response. See the `include` parameter for Response - creation above for more information. - - """ - return await PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"{RESPONSES_ENDPOINT}/{response_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=OpenAIResponse, - query_params={"include": include}, - ), - completed_statuses=["completed"], - failed_statuses=["failed"], - status_extractor=lambda response: response.status, - auth_kwargs=auth_kwargs, - ).execute() - def get_message_content_from_response( - self, response: OpenAIResponse + cls, response: OpenAIResponse ) -> list[OutputContent]: """Extract message content from the API response.""" for output in response.output: @@ -656,8 +562,9 @@ class OpenAIChatNode(OpenAITextNode): return output.root.content raise TypeError("No output message found in response") + @classmethod def get_text_from_message_content( - self, message_content: list[OutputContent] + cls, message_content: list[OutputContent] ) -> str: """Extract text content from message content.""" for content_item in message_content: @@ -665,58 +572,9 @@ class OpenAIChatNode(OpenAITextNode): return str(content_item.root.text) return "No text output found in response" - def get_history_text(self, session_id: str) -> str: - """Convert the entire history for a given session to JSON string.""" - return json.dumps(self.history[session_id]) - - def display_history_on_node(self, session_id: str, node_id: str) -> None: - """Display formatted chat history on the node UI.""" - render_spec = { - "node_id": node_id, - "component": "ChatHistoryWidget", - "props": { - "history": self.get_history_text(session_id), - }, - } - PromptServer.instance.send_sync( - "display_component", - render_spec, - ) - - def add_to_history( - self, session_id: str, prompt: str, output_text: str, response_id: str - ) -> None: - """Add a new entry to the chat history.""" - if session_id not in self.history: - self.history[session_id] = [] - self.history[session_id].append( - { - "prompt": prompt, - "response": output_text, - "response_id": response_id, - "timestamp": time.time(), - } - ) - - def parse_output_text_from_response(self, response: OpenAIResponse) -> str: - """Extract text output from the API response.""" - message_contents = self.get_message_content_from_response(response) - return self.get_text_from_message_content(message_contents) - - def generate_new_session_id(self) -> str: - """Generate a new unique session ID.""" - return str(uuid.uuid4()) - - def get_session_id(self, persist_context: bool) -> str: - """Get the current or generate a new session ID based on context persistence.""" - return ( - self.current_session_id - if persist_context - else self.generate_new_session_id() - ) - + @classmethod def tensor_to_input_image_content( - self, image: torch.Tensor, detail_level: Detail = "auto" + cls, image: torch.Tensor, detail_level: Detail = "auto" ) -> InputImageContent: """Convert a tensor to an input image content object.""" return InputImageContent( @@ -725,21 +583,27 @@ class OpenAIChatNode(OpenAITextNode): type="input_image", ) + @classmethod def create_input_message_contents( - self, + cls, prompt: str, image: Optional[torch.Tensor] = None, files: Optional[list[InputFileContent]] = None, ) -> InputMessageContentList: """Create a list of input message contents from prompt and optional image.""" - content_list: list[InputContent] = [ + content_list: list[Union[InputContent, InputTextContent, InputImageContent, InputFileContent]] = [ InputTextContent(text=prompt, type="input_text"), ] if image is not None: for i in range(image.shape[0]): content_list.append( - self.tensor_to_input_image_content(image[i].unsqueeze(0)) + InputImageContent( + detail="auto", + image_url=f"data:image/png;base64,{tensor_to_base64_string(image[i].unsqueeze(0))}", + type="input_image", + ) ) + if files is not None: content_list.extend(files) @@ -747,80 +611,28 @@ class OpenAIChatNode(OpenAITextNode): root=content_list, ) - def parse_response_id_from_prompt(self, prompt: str) -> Optional[str]: - """Extract response ID from prompt if it exists.""" - parsed_id = re.search(STARTING_POINT_ID_PATTERN, prompt) - return parsed_id.group(1) if parsed_id else None - - def strip_response_tag_from_prompt(self, prompt: str) -> str: - """Remove the response ID tag from the prompt.""" - return re.sub(STARTING_POINT_ID_PATTERN, "", prompt.strip()) - - def delete_history_after_response_id( - self, new_start_id: str, session_id: str - ) -> None: - """Delete history entries after a specific response ID.""" - if session_id not in self.history: - return - - new_history = [] - i = 0 - while ( - i < len(self.history[session_id]) - and self.history[session_id][i]["response_id"] != new_start_id - ): - new_history.append(self.history[session_id][i]) - i += 1 - - # Since it's the new starting point (not the response being edited), we include it as well - if i < len(self.history[session_id]): - new_history.append(self.history[session_id][i]) - - self.history[session_id] = new_history - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, - persist_context: bool, - model: SupportedOpenAIModel, - unique_id: Optional[str] = None, + persist_context: bool = False, + model: SupportedOpenAIModel = SupportedOpenAIModel.gpt_5.value, images: Optional[torch.Tensor] = None, files: Optional[list[InputFileContent]] = None, advanced_options: Optional[CreateModelResponseProperties] = None, - **kwargs, - ) -> tuple[str]: - # Validate inputs + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) - session_id = self.get_session_id(persist_context) - response_id_override = self.parse_response_id_from_prompt(prompt) - if response_id_override: - is_starting_from_beginning = response_id_override == "start" - if is_starting_from_beginning: - self.history[session_id] = [] - previous_response_id = None - else: - previous_response_id = response_id_override - self.delete_history_after_response_id(response_id_override, session_id) - prompt = self.strip_response_tag_from_prompt(prompt) - elif persist_context: - previous_response_id = self.previous_response_id - else: - previous_response_id = None - # Create response - create_response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=RESPONSES_ENDPOINT, - method=HttpMethod.POST, - request_model=OpenAICreateResponse, - response_model=OpenAIResponse, - ), - request=OpenAICreateResponse( + create_response = await sync_op( + cls, + ApiEndpoint(path=RESPONSES_ENDPOINT, method="POST"), + response_model=OpenAIResponse, + data=OpenAICreateResponse( input=[ Item( root=InputMessage( - content=self.create_input_message_contents( + content=cls.create_input_message_contents( prompt, images, files ), role="user", @@ -830,36 +642,57 @@ class OpenAIChatNode(OpenAITextNode): store=True, stream=False, model=model, - previous_response_id=previous_response_id, + previous_response_id=None, **( advanced_options.model_dump(exclude_none=True) if advanced_options else {} ), ), - auth_kwargs=kwargs, - ).execute() + ) response_id = create_response.id # Get result output - result_response = await self.get_result_response(response_id, auth_kwargs=kwargs) - output_text = self.parse_output_text_from_response(result_response) + result_response = await poll_op( + cls, + ApiEndpoint(path=f"{RESPONSES_ENDPOINT}/{response_id}"), + response_model=OpenAIResponse, + status_extractor=lambda response: response.status, + completed_statuses=["incomplete", "completed"] + ) + output_text = cls.get_text_from_message_content(cls.get_message_content_from_response(result_response)) # Update history - self.add_to_history(session_id, prompt, output_text, response_id) - self.display_history_on_node(session_id, unique_id) - self.previous_response_id = response_id - - return (output_text,) + render_spec = { + "node_id": cls.hidden.unique_id, + "component": "ChatHistoryWidget", + "props": { + "history": json.dumps( + [ + { + "prompt": prompt, + "response": output_text, + "response_id": str(uuid.uuid4()), + "timestamp": time.time(), + } + ] + ), + }, + } + PromptServer.instance.send_sync( + "display_component", + render_spec, + ) + return IO.NodeOutput(output_text) -class OpenAIInputFiles(ComfyNodeABC): +class OpenAIInputFiles(IO.ComfyNode): """ Loads and formats input files for OpenAI API. """ @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: + def define_schema(cls): """ For details about the supported file input types, see: https://platform.openai.com/docs/guides/pdf-files?api-mode=responses @@ -874,97 +707,92 @@ class OpenAIInputFiles(ComfyNodeABC): ] input_files = sorted(input_files, key=lambda x: x.name) input_files = [f.name for f in input_files] - return { - "required": { - "file": ( - IO.COMBO, - { - "tooltip": "Input files to include as context for the model. Only accepts text (.txt) and PDF (.pdf) files for now.", - "options": input_files, - "default": input_files[0] if input_files else None, - }, + return IO.Schema( + node_id="OpenAIInputFiles", + display_name="OpenAI ChatGPT Input Files", + category="api node/text/OpenAI", + description="Loads and prepares input files (text, pdf, etc.) to include as inputs for the OpenAI Chat Node. The files will be read by the OpenAI model when generating a response. 🛈 TIP: Can be chained together with other OpenAI Input File nodes.", + inputs=[ + IO.Combo.Input( + "file", + options=input_files, + default=input_files[0] if input_files else None, + tooltip="Input files to include as context for the model. Only accepts text (.txt) and PDF (.pdf) files for now.", ), - }, - "optional": { - "OPENAI_INPUT_FILES": ( + IO.Custom("OPENAI_INPUT_FILES").Input( "OPENAI_INPUT_FILES", - { - "tooltip": "An optional additional file(s) to batch together with the file loaded from this node. Allows chaining of input files so that a single message can include multiple input files.", - "default": None, - }, + tooltip="An optional additional file(s) to batch together with the file loaded from this node. Allows chaining of input files so that a single message can include multiple input files.", + optional=True, ), - }, - } + ], + outputs=[ + IO.Custom("OPENAI_INPUT_FILES").Output(), + ], + ) - DESCRIPTION = "Loads and prepares input files (text, pdf, etc.) to include as inputs for the OpenAI Chat Node. The files will be read by the OpenAI model when generating a response. 🛈 TIP: Can be chained together with other OpenAI Input File nodes." - RETURN_TYPES = ("OPENAI_INPUT_FILES",) - FUNCTION = "prepare_files" - CATEGORY = "api node/text/OpenAI" - - def create_input_file_content(self, file_path: str) -> InputFileContent: + @classmethod + def create_input_file_content(cls, file_path: str) -> InputFileContent: return InputFileContent( file_data=text_filepath_to_data_uri(file_path), filename=os.path.basename(file_path), type="input_file", ) - def prepare_files( - self, file: str, OPENAI_INPUT_FILES: list[InputFileContent] = [] - ) -> tuple[list[InputFileContent]]: + @classmethod + def execute(cls, file: str, OPENAI_INPUT_FILES: list[InputFileContent] = []) -> IO.NodeOutput: """ Loads and formats input files for OpenAI API. """ file_path = folder_paths.get_annotated_filepath(file) - input_file_content = self.create_input_file_content(file_path) + input_file_content = cls.create_input_file_content(file_path) files = [input_file_content] + OPENAI_INPUT_FILES - return (files,) + return IO.NodeOutput(files) -class OpenAIChatConfig(ComfyNodeABC): +class OpenAIChatConfig(IO.ComfyNode): """Allows setting additional configuration for the OpenAI Chat Node.""" - RETURN_TYPES = ("OPENAI_CHAT_CONFIG",) - FUNCTION = "configure" - DESCRIPTION = ( - "Allows specifying advanced configuration options for the OpenAI Chat Nodes." - ) - CATEGORY = "api node/text/OpenAI" - @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "truncation": ( - IO.COMBO, - { - "options": ["auto", "disabled"], - "default": "auto", - "tooltip": "The truncation strategy to use for the model response. auto: If the context of this response and previous ones exceeds the model's context window size, the model will truncate the response to fit the context window by dropping input items in the middle of the conversation.disabled: If a model response will exceed the context window size for a model, the request will fail with a 400 error", - }, + def define_schema(cls): + return IO.Schema( + node_id="OpenAIChatConfig", + display_name="OpenAI ChatGPT Advanced Options", + category="api node/text/OpenAI", + description="Allows specifying advanced configuration options for the OpenAI Chat Nodes.", + inputs=[ + IO.Combo.Input( + "truncation", + options=["auto", "disabled"], + default="auto", + tooltip="The truncation strategy to use for the model response. auto: If the context of this response and previous ones exceeds the model's context window size, the model will truncate the response to fit the context window by dropping input items in the middle of the conversation.disabled: If a model response will exceed the context window size for a model, the request will fail with a 400 error", ), - }, - "optional": { - "max_output_tokens": model_field_to_node_input( - IO.INT, - OpenAICreateResponse, + IO.Int.Input( "max_output_tokens", min=16, default=4096, max=16384, tooltip="An upper bound for the number of tokens that can be generated for a response, including visible output tokens", + optional=True, ), - "instructions": model_field_to_node_input( - IO.STRING, OpenAICreateResponse, "instructions", multiline=True + IO.String.Input( + "instructions", + multiline=True, + optional=True, + tooltip="Instructions for the model on how to generate the response", ), - }, - } + ], + outputs=[ + IO.Custom("OPENAI_CHAT_CONFIG").Output(), + ], + ) - def configure( - self, + @classmethod + def execute( + cls, truncation: bool, instructions: Optional[str] = None, max_output_tokens: Optional[int] = None, - ) -> tuple[CreateModelResponseProperties]: + ) -> IO.NodeOutput: """ Configure advanced options for the OpenAI Chat Node. @@ -974,29 +802,27 @@ class OpenAIChatConfig(ComfyNodeABC): They are not exposed as inputs at all to avoid having to manually remove depending on model choice. """ - return ( + return IO.NodeOutput( CreateModelResponseProperties( instructions=instructions, truncation=truncation, max_output_tokens=max_output_tokens, - ), + ) ) -NODE_CLASS_MAPPINGS = { - "OpenAIDalle2": OpenAIDalle2, - "OpenAIDalle3": OpenAIDalle3, - "OpenAIGPTImage1": OpenAIGPTImage1, - "OpenAIChatNode": OpenAIChatNode, - "OpenAIInputFiles": OpenAIInputFiles, - "OpenAIChatConfig": OpenAIChatConfig, -} +class OpenAIExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + OpenAIDalle2, + OpenAIDalle3, + OpenAIGPTImage1, + OpenAIChatNode, + OpenAIInputFiles, + OpenAIChatConfig, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "OpenAIDalle2": "OpenAI DALL·E 2", - "OpenAIDalle3": "OpenAI DALL·E 3", - "OpenAIGPTImage1": "OpenAI GPT Image 1", - "OpenAIChatNode": "OpenAI ChatGPT", - "OpenAIInputFiles": "OpenAI ChatGPT Input Files", - "OpenAIChatConfig": "OpenAI ChatGPT Advanced Options", -} + +async def comfy_entrypoint() -> OpenAIExtension: + return OpenAIExtension() diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py index bbc71363a..21013b591 100644 --- a/comfy_api_nodes/util/__init__.py +++ b/comfy_api_nodes/util/__init__.py @@ -18,6 +18,8 @@ from .conversions import ( tensor_to_base64_string, tensor_to_bytesio, tensor_to_pil, + text_filepath_to_base64_string, + text_filepath_to_data_uri, trim_video, video_to_base64_string, ) @@ -75,6 +77,8 @@ __all__ = [ "tensor_to_base64_string", "tensor_to_bytesio", "tensor_to_pil", + "text_filepath_to_base64_string", + "text_filepath_to_data_uri", "trim_video", "video_to_base64_string", # Validation utilities diff --git a/comfy_api_nodes/util/conversions.py b/comfy_api_nodes/util/conversions.py index b59c2bd84..971dc57de 100644 --- a/comfy_api_nodes/util/conversions.py +++ b/comfy_api_nodes/util/conversions.py @@ -1,6 +1,7 @@ import base64 import logging import math +import mimetypes import uuid from io import BytesIO from typing import Optional @@ -12,7 +13,7 @@ from PIL import Image from comfy.utils import common_upscale from comfy_api.latest import Input, InputImpl -from comfy_api.util import VideoContainer, VideoCodec +from comfy_api.util import VideoCodec, VideoContainer from ._helpers import mimetype_to_extension @@ -451,3 +452,19 @@ def resize_mask_to_image( if not allow_gradient: mask = (mask > 0.5).float() return mask + + +def text_filepath_to_base64_string(filepath: str) -> str: + """Converts a text file to a base64 string.""" + with open(filepath, "rb") as f: + file_content = f.read() + return base64.b64encode(file_content).decode("utf-8") + + +def text_filepath_to_data_uri(filepath: str) -> str: + """Converts a text file to a data URI.""" + base64_string = text_filepath_to_base64_string(filepath) + mime_type, _ = mimetypes.guess_type(filepath) + if mime_type is None: + mime_type = "application/octet-stream" + return f"data:{mime_type};base64,{base64_string}" From 4e2110c794b187c9326d604e7c0b0a4fad81148a Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 3 Nov 2025 10:29:08 +0200 Subject: [PATCH 530/759] feat(Pika-API-nodes): use new API client (#10608) --- .../apis/{pika_defs.py => pika_api.py} | 0 comfy_api_nodes/nodes_pika.py | 181 ++++++------------ 2 files changed, 57 insertions(+), 124 deletions(-) rename comfy_api_nodes/apis/{pika_defs.py => pika_api.py} (100%) diff --git a/comfy_api_nodes/apis/pika_defs.py b/comfy_api_nodes/apis/pika_api.py similarity index 100% rename from comfy_api_nodes/apis/pika_defs.py rename to comfy_api_nodes/apis/pika_api.py diff --git a/comfy_api_nodes/nodes_pika.py b/comfy_api_nodes/nodes_pika.py index 5bb406a3b..51148211b 100644 --- a/comfy_api_nodes/nodes_pika.py +++ b/comfy_api_nodes/nodes_pika.py @@ -7,24 +7,23 @@ from __future__ import annotations from io import BytesIO import logging -from typing import Optional, TypeVar +from typing import Optional import torch from typing_extensions import override from comfy_api.latest import ComfyExtension, IO from comfy_api.input_impl.video_types import VideoCodec, VideoContainer, VideoInput -from comfy_api_nodes.apis import pika_defs -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.apis import pika_api as pika_defs +from comfy_api_nodes.util import ( + validate_string, + download_url_to_video_output, + tensor_to_bytesio, ApiEndpoint, - EmptyRequest, - HttpMethod, - PollingOperation, - SynchronousOperation, + sync_op, + poll_op, ) -from comfy_api_nodes.util import validate_string, download_url_to_video_output, tensor_to_bytesio -R = TypeVar("R") PATH_PIKADDITIONS = "/proxy/pika/generate/pikadditions" PATH_PIKASWAPS = "/proxy/pika/generate/pikaswaps" @@ -40,28 +39,18 @@ PATH_VIDEO_GET = "/proxy/pika/videos" async def execute_task( - initial_operation: SynchronousOperation[R, pika_defs.PikaGenerateResponse], - auth_kwargs: Optional[dict[str, str]] = None, - node_id: Optional[str] = None, + task_id: str, + cls: type[IO.ComfyNode], ) -> IO.NodeOutput: - task_id = (await initial_operation.execute()).video_id - final_response: pika_defs.PikaVideoResponse = await PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"{PATH_VIDEO_GET}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=pika_defs.PikaVideoResponse, - ), - completed_statuses=["finished"], - failed_statuses=["failed", "cancelled"], + final_response: pika_defs.PikaVideoResponse = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_VIDEO_GET}/{task_id}"), + response_model=pika_defs.PikaVideoResponse, status_extractor=lambda response: (response.status.value if response.status else None), progress_extractor=lambda response: (response.progress if hasattr(response, "progress") else None), - auth_kwargs=auth_kwargs, - result_url_extractor=lambda response: (response.url if hasattr(response, "url") else None), - node_id=node_id, estimated_duration=60, max_poll_attempts=240, - ).execute() + ) if not final_response.url: error_msg = f"Pika task {task_id} succeeded but no video data found in response:\n{final_response}" logging.error(error_msg) @@ -124,23 +113,15 @@ class PikaImageToVideo(IO.ComfyNode): resolution=resolution, duration=duration, ) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_IMAGE_TO_VIDEO, - method=HttpMethod.POST, - request_model=pika_defs.PikaBodyGenerate22I2vGenerate22I2vPost, - response_model=pika_defs.PikaGenerateResponse, - ), - request=pika_request_data, + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_IMAGE_TO_VIDEO, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_request_data, files=pika_files, content_type="multipart/form-data", - auth_kwargs=auth, ) - return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) + return await execute_task(initial_operation.video_id, cls) class PikaTextToVideoNode(IO.ComfyNode): @@ -183,18 +164,11 @@ class PikaTextToVideoNode(IO.ComfyNode): duration: int, aspect_ratio: float, ) -> IO.NodeOutput: - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_TEXT_TO_VIDEO, - method=HttpMethod.POST, - request_model=pika_defs.PikaBodyGenerate22T2vGenerate22T2vPost, - response_model=pika_defs.PikaGenerateResponse, - ), - request=pika_defs.PikaBodyGenerate22T2vGenerate22T2vPost( + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_TEXT_TO_VIDEO, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_defs.PikaBodyGenerate22T2vGenerate22T2vPost( promptText=prompt_text, negativePrompt=negative_prompt, seed=seed, @@ -202,10 +176,9 @@ class PikaTextToVideoNode(IO.ComfyNode): duration=duration, aspectRatio=aspect_ratio, ), - auth_kwargs=auth, content_type="application/x-www-form-urlencoded", ) - return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) + return await execute_task(initial_operation.video_id, cls) class PikaScenes(IO.ComfyNode): @@ -309,24 +282,16 @@ class PikaScenes(IO.ComfyNode): duration=duration, aspectRatio=aspect_ratio, ) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_PIKASCENES, - method=HttpMethod.POST, - request_model=pika_defs.PikaBodyGenerate22C2vGenerate22PikascenesPost, - response_model=pika_defs.PikaGenerateResponse, - ), - request=pika_request_data, + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_PIKASCENES, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_request_data, files=pika_files, content_type="multipart/form-data", - auth_kwargs=auth, ) - return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) + return await execute_task(initial_operation.video_id, cls) class PikAdditionsNode(IO.ComfyNode): @@ -383,24 +348,16 @@ class PikAdditionsNode(IO.ComfyNode): negativePrompt=negative_prompt, seed=seed, ) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_PIKADDITIONS, - method=HttpMethod.POST, - request_model=pika_defs.PikaBodyGeneratePikadditionsGeneratePikadditionsPost, - response_model=pika_defs.PikaGenerateResponse, - ), - request=pika_request_data, + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_PIKADDITIONS, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_request_data, files=pika_files, content_type="multipart/form-data", - auth_kwargs=auth, ) - return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) + return await execute_task(initial_operation.video_id, cls) class PikaSwapsNode(IO.ComfyNode): @@ -472,23 +429,15 @@ class PikaSwapsNode(IO.ComfyNode): seed=seed, modifyRegionRoi=region_to_modify if region_to_modify else None, ) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_PIKASWAPS, - method=HttpMethod.POST, - request_model=pika_defs.PikaBodyGeneratePikaswapsGeneratePikaswapsPost, - response_model=pika_defs.PikaGenerateResponse, - ), - request=pika_request_data, + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_PIKASWAPS, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_request_data, files=pika_files, content_type="multipart/form-data", - auth_kwargs=auth, ) - return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) + return await execute_task(initial_operation.video_id, cls) class PikaffectsNode(IO.ComfyNode): @@ -528,18 +477,11 @@ class PikaffectsNode(IO.ComfyNode): negative_prompt: str, seed: int, ) -> IO.NodeOutput: - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_PIKAFFECTS, - method=HttpMethod.POST, - request_model=pika_defs.PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - response_model=pika_defs.PikaGenerateResponse, - ), - request=pika_defs.PikaBodyGeneratePikaffectsGeneratePikaffectsPost( + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_PIKAFFECTS, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_defs.PikaBodyGeneratePikaffectsGeneratePikaffectsPost( pikaffect=pikaffect, promptText=prompt_text, negativePrompt=negative_prompt, @@ -547,9 +489,8 @@ class PikaffectsNode(IO.ComfyNode): ), files={"image": ("image.png", tensor_to_bytesio(image), "image/png")}, content_type="multipart/form-data", - auth_kwargs=auth, ) - return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) + return await execute_task(initial_operation.video_id, cls) class PikaStartEndFrameNode(IO.ComfyNode): @@ -592,18 +533,11 @@ class PikaStartEndFrameNode(IO.ComfyNode): ("keyFrames", ("image_start.png", tensor_to_bytesio(image_start), "image/png")), ("keyFrames", ("image_end.png", tensor_to_bytesio(image_end), "image/png")), ] - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_PIKAFRAMES, - method=HttpMethod.POST, - request_model=pika_defs.PikaBodyGenerate22KeyframeGenerate22PikaframesPost, - response_model=pika_defs.PikaGenerateResponse, - ), - request=pika_defs.PikaBodyGenerate22KeyframeGenerate22PikaframesPost( + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_PIKAFRAMES, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_defs.PikaBodyGenerate22KeyframeGenerate22PikaframesPost( promptText=prompt_text, negativePrompt=negative_prompt, seed=seed, @@ -612,9 +546,8 @@ class PikaStartEndFrameNode(IO.ComfyNode): ), files=pika_files, content_type="multipart/form-data", - auth_kwargs=auth, ) - return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id) + return await execute_task(initial_operation.video_id, cls) class PikaApiNodesExtension(ComfyExtension): From e974e554ca23be505b72bc9c1614f4285c1db6e3 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 4 Nov 2025 02:59:44 +0800 Subject: [PATCH 531/759] chore: update embedded docs to v0.3.1 (#10614) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4d84b0d3e..856e373de 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ comfyui-frontend-package==1.28.8 comfyui-workflow-templates==0.2.4 -comfyui-embedded-docs==0.3.0 +comfyui-embedded-docs==0.3.1 torch torchsde torchvision From 958a17199ac519504e390ea0d53295ceb8cbd2c1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 3 Nov 2025 14:08:30 -0800 Subject: [PATCH 532/759] People should update their pytorch versions. (#10618) --- comfy/quant_ops.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index 873f173ed..5af6f118e 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -418,6 +418,10 @@ def fp8_linear(func, args, kwargs): scale_b=scale_b, out_dtype=out_dtype, ) + + if isinstance(output, tuple): # TODO: remove when we drop support for torch 2.4 + output = output[0] + if not tensor_2d: output = output.reshape((-1, input_shape[1], weight.shape[0])) From 0652cb8e2d343f68e38285755835c77bda7f6389 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 3 Nov 2025 14:37:12 -0800 Subject: [PATCH 533/759] Speed up torch.compile (#10620) --- comfy/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/comfy/ops.py b/comfy/ops.py index 0c8f23848..afe498caa 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -71,7 +71,6 @@ def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) -@torch.compiler.disable() def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False): # NOTE: offloadable=False is a a legacy and if you are a custom node author reading this please pass # offloadable=True and call uncast_bias_weight() after your last usage of the weight/bias. This From e199c8cc6758d388792fd66b99e8de832814ff91 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 3 Nov 2025 14:58:24 -0800 Subject: [PATCH 534/759] Fixes (#10621) --- comfy/quant_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index 5af6f118e..835fc4b8d 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -131,7 +131,7 @@ class QuantizedTensor(torch.Tensor): self._layout_params = layout_params def __repr__(self): - layout_name = self._layout_type.__name__ + layout_name = self._layout_type param_str = ", ".join(f"{k}={v}" for k, v in list(self._layout_params.items())[:2]) return f"QuantizedTensor(shape={self.shape}, layout={layout_name}, {param_str})" @@ -179,7 +179,7 @@ class QuantizedTensor(torch.Tensor): attr_name = f"_layout_param_{key}" layout_params[key] = inner_tensors[attr_name] - return QuantizedTensor(inner_tensors["_q_data"], layout_type, layout_params) + return QuantizedTensor(inner_tensors["_qdata"], layout_type, layout_params) @classmethod def from_float(cls, tensor, layout_type, **quantize_kwargs) -> 'QuantizedTensor': From 6b88478f9fe0874c0e17468c9fca3a0a84e6c781 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 3 Nov 2025 16:22:10 -0800 Subject: [PATCH 535/759] Bring back fp8 torch compile performance to what it should be. (#10622) --- comfy/quant_ops.py | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index 835fc4b8d..ed7b29963 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -126,7 +126,7 @@ class QuantizedTensor(torch.Tensor): return torch.Tensor._make_wrapper_subclass(cls, qdata.shape, device=qdata.device, dtype=qdata.dtype, requires_grad=False) def __init__(self, qdata, layout_type, layout_params): - self._qdata = qdata.contiguous() + self._qdata = qdata self._layout_type = layout_type self._layout_params = layout_params @@ -411,7 +411,7 @@ def fp8_linear(func, args, kwargs): try: output = torch._scaled_mm( - plain_input.reshape(-1, input_shape[2]), + plain_input.reshape(-1, input_shape[2]).contiguous(), weight_t, bias=bias, scale_a=scale_a, @@ -447,6 +447,43 @@ def fp8_linear(func, args, kwargs): return torch.nn.functional.linear(input_tensor, weight, bias) +@register_layout_op(torch.ops.aten.addmm.default, "TensorCoreFP8Layout") +def fp8_addmm(func, args, kwargs): + input_tensor = args[1] + weight = args[2] + bias = args[0] + + if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor): + out_dtype = kwargs.get("out_dtype") + if out_dtype is None: + out_dtype = input_tensor._layout_params['orig_dtype'] + + plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor) + plain_weight, scale_b = TensorCoreFP8Layout.get_plain_tensors(weight) + + output = torch._scaled_mm( + plain_input.contiguous(), + plain_weight, + bias=bias, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=out_dtype, + ) + + if isinstance(output, tuple): # TODO: remove when we drop support for torch 2.4 + output = output[0] + return output + + a = list(args) + if isinstance(args[0], QuantizedTensor): + a[0] = args[0].dequantize() + if isinstance(args[1], QuantizedTensor): + a[1] = args[1].dequantize() + if isinstance(args[2], QuantizedTensor): + a[2] = args[2].dequantize() + + return func(*a, **kwargs) + @register_layout_op(torch.ops.aten.view.default, "TensorCoreFP8Layout") @register_layout_op(torch.ops.aten.t.default, "TensorCoreFP8Layout") def fp8_func(func, args, kwargs): From 0f4ef3afa0772ad11d6d72ad21fb1e089c2fcf5f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 3 Nov 2025 18:47:14 -0800 Subject: [PATCH 536/759] This seems to slow things down slightly on Linux. (#10624) --- comfy/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ops.py b/comfy/ops.py index afe498caa..733bff99d 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -35,7 +35,7 @@ def scaled_dot_product_attention(q, k, v, *args, **kwargs): try: - if torch.cuda.is_available(): + if torch.cuda.is_available() and comfy.model_management.WINDOWS: from torch.nn.attention import SDPBackend, sdpa_kernel import inspect if "set_priority" in inspect.signature(sdpa_kernel).parameters: From af4b7b5edb339a15aa443e32aefbceac1810baa0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 3 Nov 2025 19:14:20 -0800 Subject: [PATCH 537/759] More fp8 torch.compile regressions fixed. (#10625) --- comfy/quant_ops.py | 54 ++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index ed7b29963..c56e32a73 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -446,6 +446,25 @@ def fp8_linear(func, args, kwargs): return torch.nn.functional.linear(input_tensor, weight, bias) +def fp8_mm_(input_tensor, weight, bias=None, out_dtype=None): + if out_dtype is None: + out_dtype = input_tensor._layout_params['orig_dtype'] + + plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor) + plain_weight, scale_b = TensorCoreFP8Layout.get_plain_tensors(weight) + + output = torch._scaled_mm( + plain_input.contiguous(), + plain_weight, + bias=bias, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=out_dtype, + ) + + if isinstance(output, tuple): # TODO: remove when we drop support for torch 2.4 + output = output[0] + return output @register_layout_op(torch.ops.aten.addmm.default, "TensorCoreFP8Layout") def fp8_addmm(func, args, kwargs): @@ -454,25 +473,7 @@ def fp8_addmm(func, args, kwargs): bias = args[0] if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor): - out_dtype = kwargs.get("out_dtype") - if out_dtype is None: - out_dtype = input_tensor._layout_params['orig_dtype'] - - plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor) - plain_weight, scale_b = TensorCoreFP8Layout.get_plain_tensors(weight) - - output = torch._scaled_mm( - plain_input.contiguous(), - plain_weight, - bias=bias, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=out_dtype, - ) - - if isinstance(output, tuple): # TODO: remove when we drop support for torch 2.4 - output = output[0] - return output + return fp8_mm_(input_tensor, weight, bias=bias, out_dtype=kwargs.get("out_dtype", None)) a = list(args) if isinstance(args[0], QuantizedTensor): @@ -484,6 +485,21 @@ def fp8_addmm(func, args, kwargs): return func(*a, **kwargs) +@register_layout_op(torch.ops.aten.mm.default, "TensorCoreFP8Layout") +def fp8_mm(func, args, kwargs): + input_tensor = args[0] + weight = args[1] + + if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor): + return fp8_mm_(input_tensor, weight, bias=None, out_dtype=kwargs.get("out_dtype", None)) + + a = list(args) + if isinstance(args[0], QuantizedTensor): + a[0] = args[0].dequantize() + if isinstance(args[1], QuantizedTensor): + a[1] = args[1].dequantize() + return func(*a, **kwargs) + @register_layout_op(torch.ops.aten.view.default, "TensorCoreFP8Layout") @register_layout_op(torch.ops.aten.t.default, "TensorCoreFP8Layout") def fp8_func(func, args, kwargs): From 9c71a667904a049975531f2a7dd55f4a8fc92652 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 5 Nov 2025 02:51:53 +0800 Subject: [PATCH 538/759] chore: update workflow templates to v0.2.11 (#10634) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 856e373de..249c36dee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.28.8 -comfyui-workflow-templates==0.2.4 +comfyui-workflow-templates==0.2.11 comfyui-embedded-docs==0.3.1 torch torchsde From a389ee01bb7ba5174729906a7f85bd08b5c2cb87 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Wed, 5 Nov 2025 08:14:10 +1000 Subject: [PATCH 539/759] caching: Handle None outputs tuple case (#10637) --- comfy_execution/caching.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py index e077f78b0..326a279fc 100644 --- a/comfy_execution/caching.py +++ b/comfy_execution/caching.py @@ -399,6 +399,8 @@ class RAMPressureCache(LRUCache): ram_usage = RAM_CACHE_DEFAULT_RAM_USAGE def scan_list_for_ram_usage(outputs): nonlocal ram_usage + if outputs is None: + return for output in outputs: if isinstance(output, list): scan_list_for_ram_usage(output) From 7f3e4d486cd77c3ad30eb4714ec18bdaf29e2b5c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:37:50 -0800 Subject: [PATCH 540/759] Limit amount of pinned memory on windows to prevent issues. (#10638) --- comfy/model_management.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 79c0dfdb4..0d040e55e 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1082,8 +1082,20 @@ def cast_to_device(tensor, device, dtype, copy=False): non_blocking = device_supports_non_blocking(device) return cast_to(tensor, dtype=dtype, device=device, non_blocking=non_blocking, copy=copy) + +PINNED_MEMORY = {} +TOTAL_PINNED_MEMORY = 0 +if PerformanceFeature.PinnedMem in args.fast: + if WINDOWS: + MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.45 # Windows limit is apparently 50% + else: + MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95 +else: + MAX_PINNED_MEMORY = -1 + def pin_memory(tensor): - if PerformanceFeature.PinnedMem not in args.fast: + global TOTAL_PINNED_MEMORY + if MAX_PINNED_MEMORY <= 0: return False if not is_nvidia(): @@ -1092,13 +1104,21 @@ def pin_memory(tensor): if not is_device_cpu(tensor.device): return False - if torch.cuda.cudart().cudaHostRegister(tensor.data_ptr(), tensor.numel() * tensor.element_size(), 1) == 0: + size = tensor.numel() * tensor.element_size() + if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY: + return False + + ptr = tensor.data_ptr() + if torch.cuda.cudart().cudaHostRegister(ptr, size, 1) == 0: + PINNED_MEMORY[ptr] = size + TOTAL_PINNED_MEMORY += size return True return False def unpin_memory(tensor): - if PerformanceFeature.PinnedMem not in args.fast: + global TOTAL_PINNED_MEMORY + if MAX_PINNED_MEMORY <= 0: return False if not is_nvidia(): @@ -1107,7 +1127,11 @@ def unpin_memory(tensor): if not is_device_cpu(tensor.device): return False - if torch.cuda.cudart().cudaHostUnregister(tensor.data_ptr()) == 0: + ptr = tensor.data_ptr() + if torch.cuda.cudart().cudaHostUnregister(ptr) == 0: + TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr) + if len(PINNED_MEMORY) == 0: + TOTAL_PINNED_MEMORY = 0 return True return False From 265adad858e1f31b66cd3523a02b16f5d34ced52 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 4 Nov 2025 19:42:23 -0500 Subject: [PATCH 541/759] ComfyUI version v0.3.68 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index db48b05c4..25d1a4157 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.67" +__version__ = "0.3.68" diff --git a/pyproject.toml b/pyproject.toml index ab054355c..79ff3f74a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.67" +version = "0.3.68" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 4cd881866bad0cde70273cc123d725693c1f2759 Mon Sep 17 00:00:00 2001 From: contentis Date: Wed, 5 Nov 2025 02:10:11 +0100 Subject: [PATCH 542/759] Use single apply_rope function across models (#10547) --- comfy/ldm/flux/layers.py | 4 +- comfy/ldm/flux/math.py | 10 +--- comfy/ldm/lightricks/model.py | 88 ++++++++++++++--------------------- comfy/ldm/qwen_image/model.py | 36 +++++++------- comfy/ldm/wan/model.py | 1 + 5 files changed, 59 insertions(+), 80 deletions(-) diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py index ef21b416b..a3eab0470 100644 --- a/comfy/ldm/flux/layers.py +++ b/comfy/ldm/flux/layers.py @@ -195,8 +195,8 @@ class DoubleStreamBlock(nn.Module): txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:] # calculate the img bloks - img = img + apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img) - img = img + apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img) + img += apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img) + img += apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img) # calculate the txt bloks txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt) diff --git a/comfy/ldm/flux/math.py b/comfy/ldm/flux/math.py index 8deda0d4a..158420290 100644 --- a/comfy/ldm/flux/math.py +++ b/comfy/ldm/flux/math.py @@ -7,15 +7,7 @@ import comfy.model_management def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor: - q_shape = q.shape - k_shape = k.shape - - if pe is not None: - q = q.to(dtype=pe.dtype).reshape(*q.shape[:-1], -1, 1, 2) - k = k.to(dtype=pe.dtype).reshape(*k.shape[:-1], -1, 1, 2) - q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v) - k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v) - + q, k = apply_rope(q, k, pe) heads = q.shape[1] x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask, transformer_options=transformer_options) return x diff --git a/comfy/ldm/lightricks/model.py b/comfy/ldm/lightricks/model.py index def365ba7..5bcba998b 100644 --- a/comfy/ldm/lightricks/model.py +++ b/comfy/ldm/lightricks/model.py @@ -3,12 +3,11 @@ from torch import nn import comfy.patcher_extension import comfy.ldm.modules.attention import comfy.ldm.common_dit -from einops import rearrange import math from typing import Dict, Optional, Tuple from .symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords - +from comfy.ldm.flux.math import apply_rope1 def get_timestep_embedding( timesteps: torch.Tensor, @@ -238,20 +237,6 @@ class FeedForward(nn.Module): return self.net(x) -def apply_rotary_emb(input_tensor, freqs_cis): #TODO: remove duplicate funcs and pick the best/fastest one - cos_freqs = freqs_cis[0] - sin_freqs = freqs_cis[1] - - t_dup = rearrange(input_tensor, "... (d r) -> ... d r", r=2) - t1, t2 = t_dup.unbind(dim=-1) - t_dup = torch.stack((-t2, t1), dim=-1) - input_tensor_rot = rearrange(t_dup, "... d r -> ... (d r)") - - out = input_tensor * cos_freqs + input_tensor_rot * sin_freqs - - return out - - class CrossAttention(nn.Module): def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0., attn_precision=None, dtype=None, device=None, operations=None): super().__init__() @@ -281,8 +266,8 @@ class CrossAttention(nn.Module): k = self.k_norm(k) if pe is not None: - q = apply_rotary_emb(q, pe) - k = apply_rotary_emb(k, pe) + q = apply_rope1(q.unsqueeze(1), pe).squeeze(1) + k = apply_rope1(k.unsqueeze(1), pe).squeeze(1) if mask is None: out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision, transformer_options=transformer_options) @@ -306,12 +291,17 @@ class BasicTransformerBlock(nn.Module): def forward(self, x, context=None, attention_mask=None, timestep=None, pe=None, transformer_options={}): shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None, None].to(device=x.device, dtype=x.dtype) + timestep.reshape(x.shape[0], timestep.shape[1], self.scale_shift_table.shape[0], -1)).unbind(dim=2) - x += self.attn1(comfy.ldm.common_dit.rms_norm(x) * (1 + scale_msa) + shift_msa, pe=pe, transformer_options=transformer_options) * gate_msa + norm_x = comfy.ldm.common_dit.rms_norm(x) + attn1_input = torch.addcmul(norm_x, norm_x, scale_msa).add_(shift_msa) + attn1_result = self.attn1(attn1_input, pe=pe, transformer_options=transformer_options) + x.addcmul_(attn1_result, gate_msa) x += self.attn2(x, context=context, mask=attention_mask, transformer_options=transformer_options) - y = comfy.ldm.common_dit.rms_norm(x) * (1 + scale_mlp) + shift_mlp - x += self.ff(y) * gate_mlp + norm_x = comfy.ldm.common_dit.rms_norm(x) + y = torch.addcmul(norm_x, norm_x, scale_mlp).add_(shift_mlp) + ff_result = self.ff(y) + x.addcmul_(ff_result, gate_mlp) return x @@ -327,41 +317,35 @@ def get_fractional_positions(indices_grid, max_pos): def precompute_freqs_cis(indices_grid, dim, out_dtype, theta=10000.0, max_pos=[20, 2048, 2048]): - dtype = torch.float32 #self.dtype + dtype = torch.float32 + device = indices_grid.device + # Get fractional positions and compute frequency indices fractional_positions = get_fractional_positions(indices_grid, max_pos) + indices = theta ** torch.linspace(0, 1, dim // 6, device=device, dtype=dtype) * math.pi / 2 - start = 1 - end = theta - device = fractional_positions.device + # Compute frequencies and apply cos/sin + freqs = (indices * (fractional_positions.unsqueeze(-1) * 2 - 1)).transpose(-1, -2).flatten(2) + cos_vals = freqs.cos().repeat_interleave(2, dim=-1) + sin_vals = freqs.sin().repeat_interleave(2, dim=-1) - indices = theta ** ( - torch.linspace( - math.log(start, theta), - math.log(end, theta), - dim // 6, - device=device, - dtype=dtype, - ) - ) - indices = indices.to(dtype=dtype) - - indices = indices * math.pi / 2 - - freqs = ( - (indices * (fractional_positions.unsqueeze(-1) * 2 - 1)) - .transpose(-1, -2) - .flatten(2) - ) - - cos_freq = freqs.cos().repeat_interleave(2, dim=-1) - sin_freq = freqs.sin().repeat_interleave(2, dim=-1) + # Pad if dim is not divisible by 6 if dim % 6 != 0: - cos_padding = torch.ones_like(cos_freq[:, :, : dim % 6]) - sin_padding = torch.zeros_like(cos_freq[:, :, : dim % 6]) - cos_freq = torch.cat([cos_padding, cos_freq], dim=-1) - sin_freq = torch.cat([sin_padding, sin_freq], dim=-1) - return cos_freq.to(out_dtype), sin_freq.to(out_dtype) + padding_size = dim % 6 + cos_vals = torch.cat([torch.ones_like(cos_vals[:, :, :padding_size]), cos_vals], dim=-1) + sin_vals = torch.cat([torch.zeros_like(sin_vals[:, :, :padding_size]), sin_vals], dim=-1) + + # Reshape and extract one value per pair (since repeat_interleave duplicates each value) + cos_vals = cos_vals.reshape(*cos_vals.shape[:2], -1, 2)[..., 0] # [B, N, dim//2] + sin_vals = sin_vals.reshape(*sin_vals.shape[:2], -1, 2)[..., 0] # [B, N, dim//2] + + # Build rotation matrix [[cos, -sin], [sin, cos]] and add heads dimension + freqs_cis = torch.stack([ + torch.stack([cos_vals, -sin_vals], dim=-1), + torch.stack([sin_vals, cos_vals], dim=-1) + ], dim=-2).unsqueeze(1) # [B, 1, N, dim//2, 2, 2] + + return freqs_cis.to(out_dtype) class LTXVModel(torch.nn.Module): @@ -501,7 +485,7 @@ class LTXVModel(torch.nn.Module): shift, scale = scale_shift_values[:, :, 0], scale_shift_values[:, :, 1] x = self.norm_out(x) # Modulation - x = x * (1 + scale) + shift + x = torch.addcmul(x, x, scale).add_(shift) x = self.proj_out(x) x = self.patchifier.unpatchify( diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index b9f60c2b7..81d3ee7c0 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -10,6 +10,7 @@ from comfy.ldm.modules.attention import optimized_attention_masked from comfy.ldm.flux.layers import EmbedND import comfy.ldm.common_dit import comfy.patcher_extension +from comfy.ldm.flux.math import apply_rope1 class GELU(nn.Module): def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True, dtype=None, device=None, operations=None): @@ -134,33 +135,34 @@ class Attention(nn.Module): image_rotary_emb: Optional[torch.Tensor] = None, transformer_options={}, ) -> Tuple[torch.Tensor, torch.Tensor]: + batch_size = hidden_states.shape[0] + seq_img = hidden_states.shape[1] seq_txt = encoder_hidden_states.shape[1] - img_query = self.to_q(hidden_states).unflatten(-1, (self.heads, -1)) - img_key = self.to_k(hidden_states).unflatten(-1, (self.heads, -1)) - img_value = self.to_v(hidden_states).unflatten(-1, (self.heads, -1)) + # Project and reshape to BHND format (batch, heads, seq, dim) + img_query = self.to_q(hidden_states).view(batch_size, seq_img, self.heads, -1).transpose(1, 2).contiguous() + img_key = self.to_k(hidden_states).view(batch_size, seq_img, self.heads, -1).transpose(1, 2).contiguous() + img_value = self.to_v(hidden_states).view(batch_size, seq_img, self.heads, -1).transpose(1, 2) - txt_query = self.add_q_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1)) - txt_key = self.add_k_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1)) - txt_value = self.add_v_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1)) + txt_query = self.add_q_proj(encoder_hidden_states).view(batch_size, seq_txt, self.heads, -1).transpose(1, 2).contiguous() + txt_key = self.add_k_proj(encoder_hidden_states).view(batch_size, seq_txt, self.heads, -1).transpose(1, 2).contiguous() + txt_value = self.add_v_proj(encoder_hidden_states).view(batch_size, seq_txt, self.heads, -1).transpose(1, 2) img_query = self.norm_q(img_query) img_key = self.norm_k(img_key) txt_query = self.norm_added_q(txt_query) txt_key = self.norm_added_k(txt_key) - joint_query = torch.cat([txt_query, img_query], dim=1) - joint_key = torch.cat([txt_key, img_key], dim=1) - joint_value = torch.cat([txt_value, img_value], dim=1) + joint_query = torch.cat([txt_query, img_query], dim=2) + joint_key = torch.cat([txt_key, img_key], dim=2) + joint_value = torch.cat([txt_value, img_value], dim=2) - joint_query = apply_rotary_emb(joint_query, image_rotary_emb) - joint_key = apply_rotary_emb(joint_key, image_rotary_emb) + joint_query = apply_rope1(joint_query, image_rotary_emb) + joint_key = apply_rope1(joint_key, image_rotary_emb) - joint_query = joint_query.flatten(start_dim=2) - joint_key = joint_key.flatten(start_dim=2) - joint_value = joint_value.flatten(start_dim=2) - - joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, attention_mask, transformer_options=transformer_options) + joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, + attention_mask, transformer_options=transformer_options, + skip_reshape=True) txt_attn_output = joint_hidden_states[:, :seq_txt, :] img_attn_output = joint_hidden_states[:, seq_txt:, :] @@ -413,7 +415,7 @@ class QwenImageTransformer2DModel(nn.Module): txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) - image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + image_rotary_emb = self.pe_embedder(ids).to(torch.float32).contiguous() del ids, txt_ids, img_ids hidden_states = self.img_in(hidden_states) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 5ec1511ce..a9d5e10d9 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -232,6 +232,7 @@ class WanAttentionBlock(nn.Module): # assert e[0].dtype == torch.float32 # self-attention + x = x.contiguous() # otherwise implicit in LayerNorm y = self.self_attn( torch.addcmul(repeat_e(e[0], x), self.norm1(x), 1 + repeat_e(e[1], x)), freqs, transformer_options=transformer_options) From c4a6b389de1014471a75a46ee57d2fdac4f8df93 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 4 Nov 2025 19:47:35 -0800 Subject: [PATCH 543/759] Lower ltxv mem usage to what it was before previous pr. (#10643) Bring back qwen behavior to what it was before previous pr. --- comfy/ldm/lightricks/model.py | 22 +++++++++++----------- comfy/ldm/qwen_image/model.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/comfy/ldm/lightricks/model.py b/comfy/ldm/lightricks/model.py index 5bcba998b..593f7940f 100644 --- a/comfy/ldm/lightricks/model.py +++ b/comfy/ldm/lightricks/model.py @@ -291,17 +291,17 @@ class BasicTransformerBlock(nn.Module): def forward(self, x, context=None, attention_mask=None, timestep=None, pe=None, transformer_options={}): shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None, None].to(device=x.device, dtype=x.dtype) + timestep.reshape(x.shape[0], timestep.shape[1], self.scale_shift_table.shape[0], -1)).unbind(dim=2) - norm_x = comfy.ldm.common_dit.rms_norm(x) - attn1_input = torch.addcmul(norm_x, norm_x, scale_msa).add_(shift_msa) - attn1_result = self.attn1(attn1_input, pe=pe, transformer_options=transformer_options) - x.addcmul_(attn1_result, gate_msa) + attn1_input = comfy.ldm.common_dit.rms_norm(x) + attn1_input = torch.addcmul(attn1_input, attn1_input, scale_msa).add_(shift_msa) + attn1_input = self.attn1(attn1_input, pe=pe, transformer_options=transformer_options) + x.addcmul_(attn1_input, gate_msa) + del attn1_input x += self.attn2(x, context=context, mask=attention_mask, transformer_options=transformer_options) - norm_x = comfy.ldm.common_dit.rms_norm(x) - y = torch.addcmul(norm_x, norm_x, scale_mlp).add_(shift_mlp) - ff_result = self.ff(y) - x.addcmul_(ff_result, gate_mlp) + y = comfy.ldm.common_dit.rms_norm(x) + y = torch.addcmul(y, y, scale_mlp).add_(shift_mlp) + x.addcmul_(self.ff(y), gate_mlp) return x @@ -336,8 +336,8 @@ def precompute_freqs_cis(indices_grid, dim, out_dtype, theta=10000.0, max_pos=[2 sin_vals = torch.cat([torch.zeros_like(sin_vals[:, :, :padding_size]), sin_vals], dim=-1) # Reshape and extract one value per pair (since repeat_interleave duplicates each value) - cos_vals = cos_vals.reshape(*cos_vals.shape[:2], -1, 2)[..., 0] # [B, N, dim//2] - sin_vals = sin_vals.reshape(*sin_vals.shape[:2], -1, 2)[..., 0] # [B, N, dim//2] + cos_vals = cos_vals.reshape(*cos_vals.shape[:2], -1, 2)[..., 0].to(out_dtype) # [B, N, dim//2] + sin_vals = sin_vals.reshape(*sin_vals.shape[:2], -1, 2)[..., 0].to(out_dtype) # [B, N, dim//2] # Build rotation matrix [[cos, -sin], [sin, cos]] and add heads dimension freqs_cis = torch.stack([ @@ -345,7 +345,7 @@ def precompute_freqs_cis(indices_grid, dim, out_dtype, theta=10000.0, max_pos=[2 torch.stack([sin_vals, cos_vals], dim=-1) ], dim=-2).unsqueeze(1) # [B, 1, N, dim//2, 2, 2] - return freqs_cis.to(out_dtype) + return freqs_cis class LTXVModel(torch.nn.Module): diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 81d3ee7c0..e5d0d17c1 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -415,7 +415,7 @@ class QwenImageTransformer2DModel(nn.Module): txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) - image_rotary_emb = self.pe_embedder(ids).to(torch.float32).contiguous() + image_rotary_emb = self.pe_embedder(ids).to(x.dtype).contiguous() del ids, txt_ids, img_ids hidden_states = self.img_in(hidden_states) From bda0eb2448135797d5a72f7236ce26d07e555baf Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 5 Nov 2025 12:16:00 +0200 Subject: [PATCH 544/759] feat(API-nodes): move Rodin3D nodes to new client; removed old api client.py (#10645) --- comfy_api_nodes/apis/PixverseController.py | 17 - comfy_api_nodes/apis/PixverseDto.py | 57 - comfy_api_nodes/apis/client.py | 981 ------------------ comfy_api_nodes/nodes_rodin.py | 196 ++-- comfy_api_nodes/util/client.py | 4 +- comfy_api_nodes/util/download_helpers.py | 2 +- .../{apis => util}/request_logger.py | 4 +- comfy_api_nodes/util/upload_helpers.py | 2 +- 8 files changed, 75 insertions(+), 1188 deletions(-) delete mode 100644 comfy_api_nodes/apis/PixverseController.py delete mode 100644 comfy_api_nodes/apis/PixverseDto.py delete mode 100644 comfy_api_nodes/apis/client.py rename comfy_api_nodes/{apis => util}/request_logger.py (100%) diff --git a/comfy_api_nodes/apis/PixverseController.py b/comfy_api_nodes/apis/PixverseController.py deleted file mode 100644 index 310c0f546..000000000 --- a/comfy_api_nodes/apis/PixverseController.py +++ /dev/null @@ -1,17 +0,0 @@ -# generated by datamodel-codegen: -# filename: filtered-openapi.yaml -# timestamp: 2025-04-29T23:44:54+00:00 - -from __future__ import annotations - -from typing import Optional - -from pydantic import BaseModel - -from . import PixverseDto - - -class ResponseData(BaseModel): - ErrCode: Optional[int] = None - ErrMsg: Optional[str] = None - Resp: Optional[PixverseDto.V2OpenAPII2VResp] = None diff --git a/comfy_api_nodes/apis/PixverseDto.py b/comfy_api_nodes/apis/PixverseDto.py deleted file mode 100644 index 323c38e96..000000000 --- a/comfy_api_nodes/apis/PixverseDto.py +++ /dev/null @@ -1,57 +0,0 @@ -# generated by datamodel-codegen: -# filename: filtered-openapi.yaml -# timestamp: 2025-04-29T23:44:54+00:00 - -from __future__ import annotations - -from typing import Optional - -from pydantic import BaseModel, Field - - -class V2OpenAPII2VResp(BaseModel): - video_id: Optional[int] = Field(None, description='Video_id') - - -class V2OpenAPIT2VReq(BaseModel): - aspect_ratio: str = Field( - ..., description='Aspect ratio (16:9, 4:3, 1:1, 3:4, 9:16)', examples=['16:9'] - ) - duration: int = Field( - ..., - description='Video duration (5, 8 seconds, --model=v3.5 only allows 5,8; --quality=1080p does not support 8s)', - examples=[5], - ) - model: str = Field( - ..., description='Model version (only supports v3.5)', examples=['v3.5'] - ) - motion_mode: Optional[str] = Field( - 'normal', - description='Motion mode (normal, fast, --fast only available when duration=5; --quality=1080p does not support fast)', - examples=['normal'], - ) - negative_prompt: Optional[str] = Field( - None, description='Negative prompt\n', max_length=2048 - ) - prompt: str = Field(..., description='Prompt', max_length=2048) - quality: str = Field( - ..., - description='Video quality ("360p"(Turbo model), "540p", "720p", "1080p")', - examples=['540p'], - ) - seed: Optional[int] = Field(None, description='Random seed, range: 0 - 2147483647') - style: Optional[str] = Field( - None, - description='Style (effective when model=v3.5, "anime", "3d_animation", "clay", "comic", "cyberpunk") Do not include style parameter unless needed', - examples=['anime'], - ) - template_id: Optional[int] = Field( - None, - description='Template ID (template_id must be activated before use)', - examples=[302325299692608], - ) - water_mark: Optional[bool] = Field( - False, - description='Watermark (true: add watermark, false: no watermark)', - examples=[False], - ) diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py deleted file mode 100644 index bdaddcc88..000000000 --- a/comfy_api_nodes/apis/client.py +++ /dev/null @@ -1,981 +0,0 @@ -""" -API Client Framework for api.comfy.org. - -This module provides a flexible framework for making API requests from ComfyUI nodes. -It supports both synchronous and asynchronous API operations with proper type validation. - -Key Components: --------------- -1. ApiClient - Handles HTTP requests with authentication and error handling -2. ApiEndpoint - Defines a single HTTP endpoint with its request/response models -3. ApiOperation - Executes a single synchronous API operation - -Usage Examples: --------------- - -# Example 1: Synchronous API Operation -# ------------------------------------ -# For a simple API call that returns the result immediately: - -# 1. Create the API client -api_client = ApiClient( - base_url="https://api.example.com", - auth_token="your_auth_token_here", - comfy_api_key="your_comfy_api_key_here", - timeout=30.0, - verify_ssl=True -) - -# 2. Define the endpoint -user_info_endpoint = ApiEndpoint( - path="/v1/users/me", - method=HttpMethod.GET, - request_model=EmptyRequest, # No request body needed - response_model=UserProfile, # Pydantic model for the response - query_params=None -) - -# 3. Create the request object -request = EmptyRequest() - -# 4. Create and execute the operation -operation = ApiOperation( - endpoint=user_info_endpoint, - request=request -) -user_profile = await operation.execute(client=api_client) # Returns immediately with the result - - -# Example 2: Asynchronous API Operation with Polling -# ------------------------------------------------- -# For an API that starts a task and requires polling for completion: - -# 1. Define the endpoints (initial request and polling) -generate_image_endpoint = ApiEndpoint( - path="/v1/images/generate", - method=HttpMethod.POST, - request_model=ImageGenerationRequest, - response_model=TaskCreatedResponse, - query_params=None -) - -check_task_endpoint = ApiEndpoint( - path="/v1/tasks/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=ImageGenerationResult, - query_params=None -) - -# 2. Create the request object -request = ImageGenerationRequest( - prompt="a beautiful sunset over mountains", - width=1024, - height=1024, - num_images=1 -) - -# 3. Create and execute the polling operation -operation = PollingOperation( - initial_endpoint=generate_image_endpoint, - initial_request=request, - poll_endpoint=check_task_endpoint, - task_id_field="task_id", - status_field="status", - completed_statuses=["completed"], - failed_statuses=["failed", "error"] -) - -# This will make the initial request and then poll until completion -result = await operation.execute(client=api_client) # Returns the final ImageGenerationResult when done -""" - -from __future__ import annotations -import aiohttp -import asyncio -import logging -import io -import os -import socket -from aiohttp.client_exceptions import ClientError, ClientResponseError -from typing import Type, Optional, Any, TypeVar, Generic, Callable -from enum import Enum -import json -from urllib.parse import urljoin, urlparse -from pydantic import BaseModel, Field -import uuid # For generating unique operation IDs - -from server import PromptServer -from comfy.cli_args import args -from comfy import utils -from . import request_logger - -T = TypeVar("T", bound=BaseModel) -R = TypeVar("R", bound=BaseModel) -P = TypeVar("P", bound=BaseModel) # For poll response - -PROGRESS_BAR_MAX = 100 - - -class NetworkError(Exception): - """Base exception for network-related errors with diagnostic information.""" - pass - - -class LocalNetworkError(NetworkError): - """Exception raised when local network connectivity issues are detected.""" - pass - - -class ApiServerError(NetworkError): - """Exception raised when the API server is unreachable but internet is working.""" - pass - - -class EmptyRequest(BaseModel): - """Base class for empty request bodies. - For GET requests, fields will be sent as query parameters.""" - - pass - - -class UploadRequest(BaseModel): - file_name: str = Field(..., description="Filename to upload") - content_type: Optional[str] = Field( - None, - description="Mime type of the file. For example: image/png, image/jpeg, video/mp4, etc.", - ) - - -class UploadResponse(BaseModel): - download_url: str = Field(..., description="URL to GET uploaded file") - upload_url: str = Field(..., description="URL to PUT file to upload") - - -class HttpMethod(str, Enum): - GET = "GET" - POST = "POST" - PUT = "PUT" - DELETE = "DELETE" - PATCH = "PATCH" - - -class ApiClient: - """ - Client for making HTTP requests to an API with authentication, error handling, and retry logic. - """ - - def __init__( - self, - base_url: str, - auth_token: Optional[str] = None, - comfy_api_key: Optional[str] = None, - timeout: float = 3600.0, - verify_ssl: bool = True, - max_retries: int = 3, - retry_delay: float = 1.0, - retry_backoff_factor: float = 2.0, - retry_status_codes: Optional[tuple[int, ...]] = None, - session: Optional[aiohttp.ClientSession] = None, - ): - self.base_url = base_url - self.auth_token = auth_token - self.comfy_api_key = comfy_api_key - self.timeout = timeout - self.verify_ssl = verify_ssl - self.max_retries = max_retries - self.retry_delay = retry_delay - self.retry_backoff_factor = retry_backoff_factor - # Default retry status codes: 408 (Request Timeout), 429 (Too Many Requests), - # 500, 502, 503, 504 (Server Errors) - self.retry_status_codes = retry_status_codes or (408, 429, 500, 502, 503, 504) - self._session: Optional[aiohttp.ClientSession] = session - self._owns_session = session is None # Track if we have to close it - - @staticmethod - def _generate_operation_id(path: str) -> str: - """Generates a unique operation ID for logging.""" - return f"{path.strip('/').replace('/', '_')}_{uuid.uuid4().hex[:8]}" - - @staticmethod - def _create_json_payload_args( - data: Optional[dict[str, Any]] = None, - headers: Optional[dict[str, str]] = None, - ) -> dict[str, Any]: - return { - "json": data, - "headers": headers, - } - - def _create_form_data_args( - self, - data: dict[str, Any] | None, - files: dict[str, Any] | None, - headers: Optional[dict[str, str]] = None, - multipart_parser: Callable | None = None, - ) -> dict[str, Any]: - if headers and "Content-Type" in headers: - del headers["Content-Type"] - - if multipart_parser and data: - data = multipart_parser(data) - - if isinstance(data, aiohttp.FormData): - form = data # If the parser already returned a FormData, pass it through - else: - form = aiohttp.FormData(default_to_multipart=True) - if data: # regular text fields - for k, v in data.items(): - if v is None: - continue # aiohttp fails to serialize "None" values - # aiohttp expects strings or bytes; convert enums etc. - form.add_field(k, str(v) if not isinstance(v, (bytes, bytearray)) else v) - - if files: - file_iter = files if isinstance(files, list) else files.items() - for field_name, file_obj in file_iter: - if file_obj is None: - continue # aiohttp fails to serialize "None" values - # file_obj can be (filename, bytes/io.BytesIO, content_type) tuple - if isinstance(file_obj, tuple): - filename, file_value, content_type = self._unpack_tuple(file_obj) - else: - file_value = file_obj - filename = getattr(file_obj, "name", field_name) - content_type = "application/octet-stream" - - form.add_field( - name=field_name, - value=file_value, - filename=filename, - content_type=content_type, - ) - return {"data": form, "headers": headers or {}} - - @staticmethod - def _create_urlencoded_form_data_args( - data: dict[str, Any], - headers: Optional[dict[str, str]] = None, - ) -> dict[str, Any]: - headers = headers or {} - headers["Content-Type"] = "application/x-www-form-urlencoded" - return { - "data": data, - "headers": headers, - } - - def get_headers(self) -> dict[str, str]: - """Get headers for API requests, including authentication if available""" - headers = {"Content-Type": "application/json", "Accept": "application/json"} - - if self.auth_token: - headers["Authorization"] = f"Bearer {self.auth_token}" - elif self.comfy_api_key: - headers["X-API-KEY"] = self.comfy_api_key - - return headers - - async def _check_connectivity(self, target_url: str) -> dict[str, bool]: - """ - Check connectivity to determine if network issues are local or server-related. - - Args: - target_url: URL to check connectivity to - - Returns: - Dictionary with connectivity status details - """ - results = { - "internet_accessible": False, - "api_accessible": False, - "is_local_issue": False, - "is_api_issue": False, - } - timeout = aiohttp.ClientTimeout(total=5.0) - async with aiohttp.ClientSession(timeout=timeout) as session: - try: - async with session.get("https://www.google.com", ssl=self.verify_ssl) as resp: - results["internet_accessible"] = resp.status < 500 - except (ClientError, asyncio.TimeoutError, socket.gaierror): - results["is_local_issue"] = True - return results # cannot reach the internet – early exit - - # Now check API health endpoint - parsed = urlparse(target_url) - health_url = f"{parsed.scheme}://{parsed.netloc}/health" - try: - async with session.get(health_url, ssl=self.verify_ssl) as resp: - results["api_accessible"] = resp.status < 500 - except ClientError: - pass # leave as False - - results["is_api_issue"] = results["internet_accessible"] and not results["api_accessible"] - return results - - async def request( - self, - method: str, - path: str, - params: Optional[dict[str, Any]] = None, - data: Optional[dict[str, Any]] = None, - files: Optional[dict[str, Any] | list[tuple[str, Any]]] = None, - headers: Optional[dict[str, str]] = None, - content_type: str = "application/json", - multipart_parser: Callable | None = None, - retry_count: int = 0, # Used internally for tracking retries - ) -> dict[str, Any]: - """ - Make an HTTP request to the API with automatic retries for transient errors. - - Args: - method: HTTP method (GET, POST, etc.) - path: API endpoint path (will be joined with base_url) - params: Query parameters - data: body data - files: Files to upload - headers: Additional headers - content_type: Content type of the request. Defaults to application/json. - retry_count: Internal parameter for tracking retries, do not set manually - - Returns: - Parsed JSON response - - Raises: - LocalNetworkError: If local network connectivity issues are detected - ApiServerError: If the API server is unreachable but internet is working - Exception: For other request failures - """ - - # Build full URL and merge headers - relative_path = path.lstrip("/") - url = urljoin(self.base_url, relative_path) - self._check_auth(self.auth_token, self.comfy_api_key) - - request_headers = self.get_headers() - if headers: - request_headers.update(headers) - if files: - request_headers.pop("Content-Type", None) - if params: - params = {k: v for k, v in params.items() if v is not None} # aiohttp fails to serialize None values - - logging.debug("[DEBUG] Request Headers: %s", request_headers) - logging.debug("[DEBUG] Files: %s", files) - logging.debug("[DEBUG] Params: %s", params) - logging.debug("[DEBUG] Data: %s", data) - - if content_type == "application/x-www-form-urlencoded": - payload_args = self._create_urlencoded_form_data_args(data or {}, request_headers) - elif content_type == "multipart/form-data": - payload_args = self._create_form_data_args(data, files, request_headers, multipart_parser) - else: - payload_args = self._create_json_payload_args(data, request_headers) - - operation_id = self._generate_operation_id(path) - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, - request_url=url, - request_headers=request_headers, - request_params=params, - request_data=data if content_type == "application/json" else "[form-data or other]", - ) - - session = await self._get_session() - try: - async with session.request( - method, - url, - params=params, - ssl=self.verify_ssl, - **payload_args, - ) as resp: - if resp.status >= 400: - try: - error_data = await resp.json() - except (aiohttp.ContentTypeError, json.JSONDecodeError): - error_data = await resp.text() - - return await self._handle_http_error( - ClientResponseError(resp.request_info, resp.history, status=resp.status, message=error_data), - operation_id, - method, - url, - params, - data, - files, - headers, - content_type, - multipart_parser, - retry_count=retry_count, - response_content=error_data, - ) - - # Success – parse JSON (safely) and log - try: - payload = await resp.json() - response_content_to_log = payload - except (aiohttp.ContentTypeError, json.JSONDecodeError): - payload = {} - response_content_to_log = await resp.text() - - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, - request_url=url, - response_status_code=resp.status, - response_headers=dict(resp.headers), - response_content=response_content_to_log, - ) - return payload - - except (ClientError, asyncio.TimeoutError, socket.gaierror) as e: - # Treat as *connection* problem – optionally retry, else escalate - if retry_count < self.max_retries: - delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) - logging.warning("Connection error. Retrying in %.2fs (%s/%s): %s", delay, retry_count + 1, - self.max_retries, str(e)) - await asyncio.sleep(delay) - return await self.request( - method, - path, - params=params, - data=data, - files=files, - headers=headers, - content_type=content_type, - multipart_parser=multipart_parser, - retry_count=retry_count + 1, - ) - # One final connectivity check for diagnostics - connectivity = await self._check_connectivity(self.base_url) - if connectivity["is_local_issue"]: - raise LocalNetworkError( - "Unable to connect to the API server due to local network issues. " - "Please check your internet connection and try again." - ) from e - raise ApiServerError( - f"The API server at {self.base_url} is currently unreachable. " - f"The service may be experiencing issues. Please try again later." - ) from e - - @staticmethod - def _check_auth(auth_token, comfy_api_key): - """Verify that an auth token is present or comfy_api_key is present""" - if auth_token is None and comfy_api_key is None: - raise Exception("Unauthorized: Please login first to use this node.") - return auth_token or comfy_api_key - - @staticmethod - async def upload_file( - upload_url: str, - file: io.BytesIO | str, - content_type: str | None = None, - max_retries: int = 3, - retry_delay: float = 1.0, - retry_backoff_factor: float = 2.0, - ) -> aiohttp.ClientResponse: - """Upload a file to the API with retry logic. - - Args: - upload_url: The URL to upload to - file: Either a file path string, BytesIO object, or tuple of (file_path, filename) - content_type: Optional mime type to set for the upload - max_retries: Maximum number of retry attempts - retry_delay: Initial delay between retries in seconds - retry_backoff_factor: Multiplier for the delay after each retry - """ - headers: dict[str, str] = {} - skip_auto_headers: set[str] = set() - if content_type: - headers["Content-Type"] = content_type - else: - # tell aiohttp not to add Content-Type that will break the request signature and result in a 403 status. - skip_auto_headers.add("Content-Type") - - # Extract file bytes - if isinstance(file, io.BytesIO): - file.seek(0) - data = file.read() - elif isinstance(file, str): - with open(file, "rb") as f: - data = f.read() - else: - raise ValueError("File must be BytesIO or str path") - - parsed = urlparse(upload_url) - basename = os.path.basename(parsed.path) or parsed.netloc or "upload" - operation_id = f"upload_{basename}_{uuid.uuid4().hex[:8]}" - request_logger.log_request_response( - operation_id=operation_id, - request_method="PUT", - request_url=upload_url, - request_headers=headers, - request_data=f"[File data {len(data)} bytes]", - ) - - delay = retry_delay - for attempt in range(max_retries + 1): - try: - timeout = aiohttp.ClientTimeout(total=None) # honour server side timeouts - async with aiohttp.ClientSession(timeout=timeout) as session: - async with session.put( - upload_url, data=data, headers=headers, skip_auto_headers=skip_auto_headers, - ) as resp: - resp.raise_for_status() - request_logger.log_request_response( - operation_id=operation_id, - request_method="PUT", - request_url=upload_url, - response_status_code=resp.status, - response_headers=dict(resp.headers), - response_content="File uploaded successfully.", - ) - return resp - except (ClientError, asyncio.TimeoutError) as e: - request_logger.log_request_response( - operation_id=operation_id, - request_method="PUT", - request_url=upload_url, - response_status_code=e.status if hasattr(e, "status") else None, - response_headers=dict(e.headers) if hasattr(e, "headers") else None, - response_content=None, - error_message=f"{type(e).__name__}: {str(e)}", - ) - if attempt < max_retries: - logging.warning( - "Upload failed (%s/%s). Retrying in %.2fs. %s", attempt + 1, max_retries, delay, str(e) - ) - await asyncio.sleep(delay) - delay *= retry_backoff_factor - else: - raise NetworkError(f"Failed to upload file after {max_retries + 1} attempts: {e}") from e - - async def _handle_http_error( - self, - exc: ClientResponseError, - operation_id: str, - *req_meta, - retry_count: int, - response_content: dict | str = "", - ) -> dict[str, Any]: - status_code = exc.status - if status_code == 401: - user_friendly = "Unauthorized: Please login first to use this node." - elif status_code == 402: - user_friendly = "Payment Required: Please add credits to your account to use this node." - elif status_code == 409: - user_friendly = "There is a problem with your account. Please contact support@comfy.org." - elif status_code == 429: - user_friendly = "Rate Limit Exceeded: Please try again later." - else: - if isinstance(response_content, dict): - if "error" in response_content and "message" in response_content["error"]: - user_friendly = f"API Error: {response_content['error']['message']}" - if "type" in response_content["error"]: - user_friendly += f" (Type: {response_content['error']['type']})" - else: # Handle cases where error is just a JSON dict with unknown format - user_friendly = f"API Error: {json.dumps(response_content)}" - else: - if len(response_content) < 200: # Arbitrary limit for display - user_friendly = f"API Error (raw): {response_content}" - else: - user_friendly = f"API Error (raw, status {response_content})" - - request_logger.log_request_response( - operation_id=operation_id, - request_method=req_meta[0], - request_url=req_meta[1], - response_status_code=exc.status, - response_headers=dict(req_meta[5]) if req_meta[5] else None, - response_content=response_content, - error_message=f"HTTP Error {exc.status}", - ) - - logging.debug("[DEBUG] API Error: %s (Status: %s)", user_friendly, status_code) - if response_content: - logging.debug("[DEBUG] Response content: %s", response_content) - - # Retry if eligible - if status_code in self.retry_status_codes and retry_count < self.max_retries: - delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) - logging.warning( - "HTTP error %s. Retrying in %.2fs (%s/%s)", - status_code, - delay, - retry_count + 1, - self.max_retries, - ) - await asyncio.sleep(delay) - return await self.request( - req_meta[0], # method - req_meta[1].replace(self.base_url, ""), # path - params=req_meta[2], - data=req_meta[3], - files=req_meta[4], - headers=req_meta[5], - content_type=req_meta[6], - multipart_parser=req_meta[7], - retry_count=retry_count + 1, - ) - - raise Exception(user_friendly) from exc - - @staticmethod - def _unpack_tuple(t): - """Helper to normalise (filename, file, content_type) tuples.""" - if len(t) == 3: - return t - elif len(t) == 2: - return t[0], t[1], "application/octet-stream" - else: - raise ValueError("files tuple must be (filename, file[, content_type])") - - async def _get_session(self) -> aiohttp.ClientSession: - if self._session is None or self._session.closed: - timeout = aiohttp.ClientTimeout(total=self.timeout) - self._session = aiohttp.ClientSession(timeout=timeout) - self._owns_session = True - return self._session - - async def close(self) -> None: - if self._owns_session and self._session and not self._session.closed: - await self._session.close() - - async def __aenter__(self) -> "ApiClient": - """Allow usage as async‑context‑manager – ensures clean teardown""" - return self - - async def __aexit__(self, exc_type, exc, tb): - await self.close() - - -class ApiEndpoint(Generic[T, R]): - """Defines an API endpoint with its request and response types""" - - def __init__( - self, - path: str, - method: HttpMethod, - request_model: Type[T], - response_model: Type[R], - query_params: Optional[dict[str, Any]] = None, - ): - """Initialize an API endpoint definition. - - Args: - path: The URL path for this endpoint, can include placeholders like {id} - method: The HTTP method to use (GET, POST, etc.) - request_model: Pydantic model class that defines the structure and validation rules for API requests to this endpoint - response_model: Pydantic model class that defines the structure and validation rules for API responses from this endpoint - query_params: Optional dictionary of query parameters to include in the request - """ - self.path = path - self.method = method - self.request_model = request_model - self.response_model = response_model - self.query_params = query_params or {} - - -class SynchronousOperation(Generic[T, R]): - """Represents a single synchronous API operation.""" - - def __init__( - self, - endpoint: ApiEndpoint[T, R], - request: T, - files: Optional[dict[str, Any] | list[tuple[str, Any]]] = None, - api_base: str | None = None, - auth_token: Optional[str] = None, - comfy_api_key: Optional[str] = None, - auth_kwargs: Optional[dict[str, str]] = None, - timeout: float = 7200.0, - verify_ssl: bool = True, - content_type: str = "application/json", - multipart_parser: Callable | None = None, - max_retries: int = 3, - retry_delay: float = 1.0, - retry_backoff_factor: float = 2.0, - ) -> None: - self.endpoint = endpoint - self.request = request - self.files = files - self.api_base: str = api_base or args.comfy_api_base - self.auth_token = auth_token - self.comfy_api_key = comfy_api_key - if auth_kwargs is not None: - self.auth_token = auth_kwargs.get("auth_token", self.auth_token) - self.comfy_api_key = auth_kwargs.get("comfy_api_key", self.comfy_api_key) - self.timeout = timeout - self.verify_ssl = verify_ssl - self.content_type = content_type - self.multipart_parser = multipart_parser - self.max_retries = max_retries - self.retry_delay = retry_delay - self.retry_backoff_factor = retry_backoff_factor - - async def execute(self, client: Optional[ApiClient] = None) -> R: - owns_client = client is None - if owns_client: - client = ApiClient( - base_url=self.api_base, - auth_token=self.auth_token, - comfy_api_key=self.comfy_api_key, - timeout=self.timeout, - verify_ssl=self.verify_ssl, - max_retries=self.max_retries, - retry_delay=self.retry_delay, - retry_backoff_factor=self.retry_backoff_factor, - ) - - try: - request_dict: Optional[dict[str, Any]] - if isinstance(self.request, EmptyRequest): - request_dict = None - else: - request_dict = self.request.model_dump(exclude_none=True) - for k, v in list(request_dict.items()): - if isinstance(v, Enum): - request_dict[k] = v.value - - logging.debug("[DEBUG] API Request: %s %s", self.endpoint.method.value, self.endpoint.path) - logging.debug("[DEBUG] Request Data: %s", json.dumps(request_dict, indent=2)) - logging.debug("[DEBUG] Query Params: %s", self.endpoint.query_params) - - response_json = await client.request( - self.endpoint.method.value, - self.endpoint.path, - params=self.endpoint.query_params, - data=request_dict, - files=self.files, - content_type=self.content_type, - multipart_parser=self.multipart_parser, - ) - - logging.debug("=" * 50) - logging.debug("[DEBUG] RESPONSE DETAILS:") - logging.debug("[DEBUG] Status Code: 200 (Success)") - logging.debug("[DEBUG] Response Body: %s", json.dumps(response_json, indent=2)) - logging.debug("=" * 50) - - parsed_response = self.endpoint.response_model.model_validate(response_json) - logging.debug("[DEBUG] Parsed Response: %s", parsed_response) - return parsed_response - finally: - if owns_client: - await client.close() - - -class TaskStatus(str, Enum): - """Enum for task status values""" - - COMPLETED = "completed" - FAILED = "failed" - PENDING = "pending" - - -class PollingOperation(Generic[T, R]): - """Represents an asynchronous API operation that requires polling for completion.""" - - def __init__( - self, - poll_endpoint: ApiEndpoint[EmptyRequest, R], - completed_statuses: list[str], - failed_statuses: list[str], - *, - status_extractor: Callable[[R], Optional[str]], - progress_extractor: Callable[[R], Optional[float]] | None = None, - result_url_extractor: Callable[[R], Optional[str]] | None = None, - price_extractor: Callable[[R], Optional[float]] | None = None, - request: Optional[T] = None, - api_base: str | None = None, - auth_token: Optional[str] = None, - comfy_api_key: Optional[str] = None, - auth_kwargs: Optional[dict[str, str]] = None, - poll_interval: float = 5.0, - max_poll_attempts: int = 120, # Default max polling attempts (10 minutes with 5s interval) - max_retries: int = 3, # Max retries per individual API call - retry_delay: float = 1.0, - retry_backoff_factor: float = 2.0, - estimated_duration: Optional[float] = None, - node_id: Optional[str] = None, - ) -> None: - self.poll_endpoint = poll_endpoint - self.request = request - self.api_base: str = api_base or args.comfy_api_base - self.auth_token = auth_token - self.comfy_api_key = comfy_api_key - if auth_kwargs is not None: - self.auth_token = auth_kwargs.get("auth_token", self.auth_token) - self.comfy_api_key = auth_kwargs.get("comfy_api_key", self.comfy_api_key) - self.poll_interval = poll_interval - self.max_poll_attempts = max_poll_attempts - self.max_retries = max_retries - self.retry_delay = retry_delay - self.retry_backoff_factor = retry_backoff_factor - self.estimated_duration = estimated_duration - self.status_extractor = status_extractor or (lambda x: getattr(x, "status", None)) - self.progress_extractor = progress_extractor - self.result_url_extractor = result_url_extractor - self.price_extractor = price_extractor - self.node_id = node_id - self.completed_statuses = completed_statuses - self.failed_statuses = failed_statuses - self.final_response: Optional[R] = None - self.extracted_price: Optional[float] = None - - async def execute(self, client: Optional[ApiClient] = None) -> R: - owns_client = client is None - if owns_client: - client = ApiClient( - base_url=self.api_base, - auth_token=self.auth_token, - comfy_api_key=self.comfy_api_key, - max_retries=self.max_retries, - retry_delay=self.retry_delay, - retry_backoff_factor=self.retry_backoff_factor, - ) - try: - return await self._poll_until_complete(client) - finally: - if owns_client: - await client.close() - - def _display_text_on_node(self, text: str): - if not self.node_id: - return - if self.extracted_price is not None: - text = f"Price: ${self.extracted_price}\n{text}" - PromptServer.instance.send_progress_text(text, self.node_id) - - def _display_time_progress_on_node(self, time_completed: int | float): - if not self.node_id: - return - if self.estimated_duration is not None: - remaining = max(0, int(self.estimated_duration) - time_completed) - message = f"Task in progress: {time_completed}s (~{remaining}s remaining)" - else: - message = f"Task in progress: {time_completed}s" - self._display_text_on_node(message) - - def _check_task_status(self, response: R) -> TaskStatus: - try: - status = self.status_extractor(response) - if status in self.completed_statuses: - return TaskStatus.COMPLETED - if status in self.failed_statuses: - return TaskStatus.FAILED - return TaskStatus.PENDING - except Exception as e: - logging.error("Error extracting status: %s", e) - return TaskStatus.PENDING - - async def _poll_until_complete(self, client: ApiClient) -> R: - """Poll until the task is complete""" - consecutive_errors = 0 - max_consecutive_errors = min(5, self.max_retries * 2) # Limit consecutive errors - - if self.progress_extractor: - progress = utils.ProgressBar(PROGRESS_BAR_MAX) - - status = TaskStatus.PENDING - for poll_count in range(1, self.max_poll_attempts + 1): - try: - logging.debug("[DEBUG] Polling attempt #%s", poll_count) - - request_dict = None if self.request is None else self.request.model_dump(exclude_none=True) - - if poll_count == 1: - logging.debug( - "[DEBUG] Poll Request: %s %s", - self.poll_endpoint.method.value, - self.poll_endpoint.path, - ) - logging.debug( - "[DEBUG] Poll Request Data: %s", - json.dumps(request_dict, indent=2) if request_dict else "None", - ) - - # Query task status - resp = await client.request( - self.poll_endpoint.method.value, - self.poll_endpoint.path, - params=self.poll_endpoint.query_params, - data=request_dict, - ) - consecutive_errors = 0 # reset on success - response_obj: R = self.poll_endpoint.response_model.model_validate(resp) - - # Check if task is complete - status = self._check_task_status(response_obj) - logging.debug("[DEBUG] Task Status: %s", status) - - # If progress extractor is provided, extract progress - if self.progress_extractor: - new_progress = self.progress_extractor(response_obj) - if new_progress is not None: - progress.update_absolute(new_progress, total=PROGRESS_BAR_MAX) - - if self.price_extractor: - price = self.price_extractor(response_obj) - if price is not None: - self.extracted_price = price - - if status == TaskStatus.COMPLETED: - message = "Task completed successfully" - if self.result_url_extractor: - result_url = self.result_url_extractor(response_obj) - if result_url: - message = f"Result URL: {result_url}" - logging.debug("[DEBUG] %s", message) - self._display_text_on_node(message) - self.final_response = response_obj - if self.progress_extractor: - progress.update(100) - return self.final_response - if status == TaskStatus.FAILED: - message = f"Task failed: {json.dumps(resp)}" - logging.error("[DEBUG] %s", message) - raise Exception(message) - logging.debug("[DEBUG] Task still pending, continuing to poll...") - # Task pending – wait - for i in range(int(self.poll_interval)): - self._display_time_progress_on_node((poll_count - 1) * self.poll_interval + i) - await asyncio.sleep(1) - - except (LocalNetworkError, ApiServerError, NetworkError) as e: - consecutive_errors += 1 - if consecutive_errors >= max_consecutive_errors: - raise Exception( - f"Polling aborted after {consecutive_errors} network errors: {str(e)}" - ) from e - logging.warning( - "Network error (%s/%s): %s", - consecutive_errors, - max_consecutive_errors, - str(e), - ) - await asyncio.sleep(self.poll_interval) - except Exception as e: - # For other errors, increment count and potentially abort - consecutive_errors += 1 - if consecutive_errors >= max_consecutive_errors or status == TaskStatus.FAILED: - raise Exception( - f"Polling aborted after {consecutive_errors} consecutive errors: {str(e)}" - ) from e - - logging.error("[DEBUG] Polling error: %s", str(e)) - logging.warning( - "Error during polling (attempt %s/%s): %s. Will retry in %s seconds.", - poll_count, - self.max_poll_attempts, - str(e), - self.poll_interval, - ) - await asyncio.sleep(self.poll_interval) - - # If we've exhausted all polling attempts - raise Exception( - f"Polling timed out after {self.max_poll_attempts} attempts (" f"{self.max_poll_attempts * self.poll_interval} seconds). " - "The operation may still be running on the server but is taking longer than expected." - ) diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index ad4029236..e60e7a6d6 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -5,12 +5,9 @@ Rodin API docs: https://developer.hyper3d.ai/ """ -from __future__ import annotations from inspect import cleandoc import folder_paths as comfy_paths -import aiohttp import os -import asyncio import logging import math from typing import Optional @@ -26,11 +23,11 @@ from comfy_api_nodes.apis.rodin_api import ( Rodin3DDownloadResponse, JobStatus, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( + sync_op, + poll_op, ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, + download_url_to_bytesio, ) from comfy_api.latest import ComfyExtension, IO @@ -121,35 +118,31 @@ def tensor_to_filelike(tensor, max_pixels: int = 2048*2048): async def create_generate_task( + cls: type[IO.ComfyNode], images=None, seed=1, material="PBR", quality_override=18000, tier="Regular", mesh_mode="Quad", - TAPose = False, - auth_kwargs: Optional[dict[str, str]] = None, + ta_pose: bool = False, ): if images is None: raise Exception("Rodin 3D generate requires at least 1 image.") if len(images) > 5: raise Exception("Rodin 3D generate requires up to 5 image.") - path = "/proxy/rodin/api/v2/rodin" - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=Rodin3DGenerateRequest, - response_model=Rodin3DGenerateResponse, - ), - request=Rodin3DGenerateRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/rodin/api/v2/rodin", method="POST"), + response_model=Rodin3DGenerateResponse, + data=Rodin3DGenerateRequest( seed=seed, tier=tier, material=material, quality_override=quality_override, mesh_mode=mesh_mode, - TAPose=TAPose, + TAPose=ta_pose, ), files=[ ( @@ -159,11 +152,8 @@ async def create_generate_task( for image in images if image is not None ], content_type="multipart/form-data", - auth_kwargs=auth_kwargs, ) - response = await operation.execute() - if hasattr(response, "error"): error_message = f"Rodin3D Create 3D generate Task Failed. Message: {response.message}, error: {response.error}" logging.error(error_message) @@ -187,74 +177,46 @@ def check_rodin_status(response: Rodin3DCheckStatusResponse) -> str: return "DONE" return "Generating" +def extract_progress(response: Rodin3DCheckStatusResponse) -> Optional[int]: + if not response.jobs: + return None + completed_count = sum(1 for job in response.jobs if job.status == JobStatus.Done) + return int((completed_count / len(response.jobs)) * 100) -async def poll_for_task_status( - subscription_key, auth_kwargs: Optional[dict[str, str]] = None, -) -> Rodin3DCheckStatusResponse: - poll_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path="/proxy/rodin/api/v2/status", - method=HttpMethod.POST, - request_model=Rodin3DCheckStatusRequest, - response_model=Rodin3DCheckStatusResponse, - ), - request=Rodin3DCheckStatusRequest(subscription_key=subscription_key), - completed_statuses=["DONE"], - failed_statuses=["FAILED"], - status_extractor=check_rodin_status, - poll_interval=3.0, - auth_kwargs=auth_kwargs, - ) + +async def poll_for_task_status(subscription_key: str, cls: type[IO.ComfyNode]) -> Rodin3DCheckStatusResponse: logging.info("[ Rodin3D API - CheckStatus ] Generate Start!") - return await poll_operation.execute() - - -async def get_rodin_download_list(uuid, auth_kwargs: Optional[dict[str, str]] = None) -> Rodin3DDownloadResponse: - logging.info("[ Rodin3D API - Downloading ] Generate Successfully!") - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/rodin/api/v2/download", - method=HttpMethod.POST, - request_model=Rodin3DDownloadRequest, - response_model=Rodin3DDownloadResponse, - ), - request=Rodin3DDownloadRequest(task_uuid=uuid), - auth_kwargs=auth_kwargs, + return await poll_op( + cls, + ApiEndpoint(path="/proxy/rodin/api/v2/status", method="POST"), + response_model=Rodin3DCheckStatusResponse, + data=Rodin3DCheckStatusRequest(subscription_key=subscription_key), + status_extractor=check_rodin_status, + progress_extractor=extract_progress, ) - return await operation.execute() -async def download_files(url_list, task_uuid): +async def get_rodin_download_list(uuid: str, cls: type[IO.ComfyNode]) -> Rodin3DDownloadResponse: + logging.info("[ Rodin3D API - Downloading ] Generate Successfully!") + return await sync_op( + cls, + ApiEndpoint(path="/proxy/rodin/api/v2/download", method="POST"), + response_model=Rodin3DDownloadResponse, + data=Rodin3DDownloadRequest(task_uuid=uuid), + monitor_progress=False, + ) + + +async def download_files(url_list, task_uuid: str): result_folder_name = f"Rodin3D_{task_uuid}" save_path = os.path.join(comfy_paths.get_output_directory(), result_folder_name) os.makedirs(save_path, exist_ok=True) model_file_path = None - async with aiohttp.ClientSession() as session: - for i in url_list.list: - file_path = os.path.join(save_path, i.name) - if file_path.endswith(".glb"): - model_file_path = os.path.join(result_folder_name, i.name) - logging.info("[ Rodin3D API - download_files ] Downloading file: %s", file_path) - max_retries = 5 - for attempt in range(max_retries): - try: - async with session.get(i.url) as resp: - resp.raise_for_status() - with open(file_path, "wb") as f: - async for chunk in resp.content.iter_chunked(32 * 1024): - f.write(chunk) - break - except Exception as e: - logging.info("[ Rodin3D API - download_files ] Error downloading %s:%s", file_path, str(e)) - if attempt < max_retries - 1: - logging.info("Retrying...") - await asyncio.sleep(2) - else: - logging.info( - "[ Rodin3D API - download_files ] Failed to download %s after %s attempts.", - file_path, - max_retries, - ) + for i in url_list.list: + file_path = os.path.join(save_path, i.name) + if file_path.endswith(".glb"): + model_file_path = os.path.join(result_folder_name, i.name) + await download_url_to_bytesio(i.url, file_path) return model_file_path @@ -276,6 +238,7 @@ class Rodin3D_Regular(IO.ComfyNode): hidden=[ IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -294,21 +257,17 @@ class Rodin3D_Regular(IO.ComfyNode): for i in range(num_images): m_images.append(Images[i]) mesh_mode, quality_override = get_quality_mode(Polygon_count) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } task_uuid, subscription_key = await create_generate_task( + cls, images=m_images, seed=Seed, material=Material_Type, quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, - auth_kwargs=auth, ) - await poll_for_task_status(subscription_key, auth_kwargs=auth) - download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) + await poll_for_task_status(subscription_key, cls) + download_list = await get_rodin_download_list(task_uuid, cls) model = await download_files(download_list, task_uuid) return IO.NodeOutput(model) @@ -332,6 +291,7 @@ class Rodin3D_Detail(IO.ComfyNode): hidden=[ IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -350,21 +310,17 @@ class Rodin3D_Detail(IO.ComfyNode): for i in range(num_images): m_images.append(Images[i]) mesh_mode, quality_override = get_quality_mode(Polygon_count) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } task_uuid, subscription_key = await create_generate_task( + cls, images=m_images, seed=Seed, material=Material_Type, quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, - auth_kwargs=auth, ) - await poll_for_task_status(subscription_key, auth_kwargs=auth) - download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) + await poll_for_task_status(subscription_key, cls) + download_list = await get_rodin_download_list(task_uuid, cls) model = await download_files(download_list, task_uuid) return IO.NodeOutput(model) @@ -388,6 +344,7 @@ class Rodin3D_Smooth(IO.ComfyNode): hidden=[ IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -400,27 +357,22 @@ class Rodin3D_Smooth(IO.ComfyNode): Material_Type, Polygon_count, ) -> IO.NodeOutput: - tier = "Smooth" num_images = Images.shape[0] m_images = [] for i in range(num_images): m_images.append(Images[i]) mesh_mode, quality_override = get_quality_mode(Polygon_count) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } task_uuid, subscription_key = await create_generate_task( + cls, images=m_images, seed=Seed, material=Material_Type, quality_override=quality_override, - tier=tier, + tier="Smooth", mesh_mode=mesh_mode, - auth_kwargs=auth, ) - await poll_for_task_status(subscription_key, auth_kwargs=auth) - download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) + await poll_for_task_status(subscription_key, cls) + download_list = await get_rodin_download_list(task_uuid, cls) model = await download_files(download_list, task_uuid) return IO.NodeOutput(model) @@ -451,6 +403,7 @@ class Rodin3D_Sketch(IO.ComfyNode): hidden=[ IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -461,29 +414,21 @@ class Rodin3D_Sketch(IO.ComfyNode): Images, Seed, ) -> IO.NodeOutput: - tier = "Sketch" num_images = Images.shape[0] m_images = [] for i in range(num_images): m_images.append(Images[i]) - material_type = "PBR" - quality_override = 18000 - mesh_mode = "Quad" - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } task_uuid, subscription_key = await create_generate_task( + cls, images=m_images, seed=Seed, - material=material_type, - quality_override=quality_override, - tier=tier, - mesh_mode=mesh_mode, - auth_kwargs=auth, + material="PBR", + quality_override=18000, + tier="Sketch", + mesh_mode="Quad", ) - await poll_for_task_status(subscription_key, auth_kwargs=auth) - download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) + await poll_for_task_status(subscription_key, cls) + download_list = await get_rodin_download_list(task_uuid, cls) model = await download_files(download_list, task_uuid) return IO.NodeOutput(model) @@ -522,6 +467,7 @@ class Rodin3D_Gen2(IO.ComfyNode): hidden=[ IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -541,22 +487,18 @@ class Rodin3D_Gen2(IO.ComfyNode): for i in range(num_images): m_images.append(Images[i]) mesh_mode, quality_override = get_quality_mode(Polygon_count) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } task_uuid, subscription_key = await create_generate_task( + cls, images=m_images, seed=Seed, material=Material_Type, quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, - TAPose=TAPose, - auth_kwargs=auth, + ta_pose=TAPose, ) - await poll_for_task_status(subscription_key, auth_kwargs=auth) - download_list = await get_rodin_download_list(task_uuid, auth_kwargs=auth) + await poll_for_task_status(subscription_key, cls) + download_list = await get_rodin_download_list(task_uuid, cls) model = await download_files(download_list, task_uuid) return IO.NodeOutput(model) diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py index 65bb35f0f..2d5dcd648 100644 --- a/comfy_api_nodes/util/client.py +++ b/comfy_api_nodes/util/client.py @@ -16,9 +16,9 @@ from pydantic import BaseModel from comfy import utils from comfy_api.latest import IO -from comfy_api_nodes.apis import request_logger from server import PromptServer +from . import request_logger from ._helpers import ( default_base_url, get_auth_header, @@ -77,7 +77,7 @@ class _PollUIState: _RETRY_STATUS = {408, 429, 500, 502, 503, 504} -COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished"] +COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done"] FAILED_STATUSES = ["cancelled", "canceled", "fail", "failed", "error"] QUEUED_STATUSES = ["created", "queued", "queueing", "submitted"] diff --git a/comfy_api_nodes/util/download_helpers.py b/comfy_api_nodes/util/download_helpers.py index 364874bed..14207dc68 100644 --- a/comfy_api_nodes/util/download_helpers.py +++ b/comfy_api_nodes/util/download_helpers.py @@ -12,8 +12,8 @@ from aiohttp.client_exceptions import ClientError, ContentTypeError from comfy_api.input_impl import VideoFromFile from comfy_api.latest import IO as COMFY_IO -from comfy_api_nodes.apis import request_logger +from . import request_logger from ._helpers import ( default_base_url, get_auth_header, diff --git a/comfy_api_nodes/apis/request_logger.py b/comfy_api_nodes/util/request_logger.py similarity index 100% rename from comfy_api_nodes/apis/request_logger.py rename to comfy_api_nodes/util/request_logger.py index c6974d35c..ac52e2eab 100644 --- a/comfy_api_nodes/apis/request_logger.py +++ b/comfy_api_nodes/util/request_logger.py @@ -1,11 +1,11 @@ from __future__ import annotations -import os import datetime +import hashlib import json import logging +import os import re -import hashlib from typing import Any import folder_paths diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py index 7bfc61704..632450d9b 100644 --- a/comfy_api_nodes/util/upload_helpers.py +++ b/comfy_api_nodes/util/upload_helpers.py @@ -13,8 +13,8 @@ from pydantic import BaseModel, Field from comfy_api.latest import IO, Input from comfy_api.util import VideoCodec, VideoContainer -from comfy_api_nodes.apis import request_logger +from . import request_logger from ._helpers import is_processing_interrupted, sleep_with_interrupt from .client import ( ApiEndpoint, From 97f198e4215680a83749ba95849f3cdcfa7aa64a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 5 Nov 2025 15:07:35 -0800 Subject: [PATCH 545/759] Fix qwen controlnet regression. (#10657) --- comfy/ldm/qwen_image/controlnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ldm/qwen_image/controlnet.py b/comfy/ldm/qwen_image/controlnet.py index 92ac3cf0a..a6d408104 100644 --- a/comfy/ldm/qwen_image/controlnet.py +++ b/comfy/ldm/qwen_image/controlnet.py @@ -44,7 +44,7 @@ class QwenImageControlNetModel(QwenImageTransformer2DModel): txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) - image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + image_rotary_emb = self.pe_embedder(ids).to(x.dtype).contiguous() del ids, txt_ids, img_ids hidden_states = self.img_in(hidden_states) + self.controlnet_x_embedder(hint) From 1d69245981f9fb3861018613246042296d887dd3 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 5 Nov 2025 15:08:13 -0800 Subject: [PATCH 546/759] Enable pinned memory by default on Nvidia. (#10656) Removed the --fast pinned_memory flag. You can use --disable-pinned-memory to disable it. Please report if it causes any issues. --- comfy/cli_args.py | 3 ++- comfy/model_management.py | 22 +++++++++------------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 3947e62a8..2f30b72d2 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -145,10 +145,11 @@ class PerformanceFeature(enum.Enum): Fp8MatrixMultiplication = "fp8_matrix_mult" CublasOps = "cublas_ops" AutoTune = "autotune" - PinnedMem = "pinned_memory" parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. This is used to test new features so using it might crash your comfyui. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature)))) +parser.add_argument("--disable-pinned-memory", action="store_true", help="Disable pinned memory use.") + parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.") parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.") diff --git a/comfy/model_management.py b/comfy/model_management.py index 0d040e55e..4d13c52c1 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1085,22 +1085,21 @@ def cast_to_device(tensor, device, dtype, copy=False): PINNED_MEMORY = {} TOTAL_PINNED_MEMORY = 0 -if PerformanceFeature.PinnedMem in args.fast: - if WINDOWS: - MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.45 # Windows limit is apparently 50% - else: - MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95 -else: - MAX_PINNED_MEMORY = -1 +MAX_PINNED_MEMORY = -1 +if not args.disable_pinned_memory: + if is_nvidia(): + if WINDOWS: + MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.45 # Windows limit is apparently 50% + else: + MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95 + logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024))) + def pin_memory(tensor): global TOTAL_PINNED_MEMORY if MAX_PINNED_MEMORY <= 0: return False - if not is_nvidia(): - return False - if not is_device_cpu(tensor.device): return False @@ -1121,9 +1120,6 @@ def unpin_memory(tensor): if MAX_PINNED_MEMORY <= 0: return False - if not is_nvidia(): - return False - if not is_device_cpu(tensor.device): return False From 09dc24c8a982776abd5cb2f71e3d041139e1d5b2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 5 Nov 2025 16:11:15 -0800 Subject: [PATCH 547/759] Pinned mem also seems to work on AMD. (#10658) --- comfy/model_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 4d13c52c1..7a30c4bec 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1087,7 +1087,7 @@ PINNED_MEMORY = {} TOTAL_PINNED_MEMORY = 0 MAX_PINNED_MEMORY = -1 if not args.disable_pinned_memory: - if is_nvidia(): + if is_nvidia() or is_amd(): if WINDOWS: MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.45 # Windows limit is apparently 50% else: From e05c90712670fa4a2ffebd44046fc78747193a36 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 6 Nov 2025 01:11:30 -0800 Subject: [PATCH 548/759] Clarify release cycle. (#10667) --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4204777e9..8142f595b 100644 --- a/README.md +++ b/README.md @@ -112,10 +112,11 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git ## Release Process -ComfyUI follows a weekly release cycle targeting Friday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories: +ComfyUI follows a weekly release cycle targeting Monday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories: 1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)** - - Releases a new stable version (e.g., v0.7.0) + - Releases a new stable version (e.g., v0.7.0) roughly every week. + - Commits outside of the stable release tags may be very unstable and break many custom nodes. - Serves as the foundation for the desktop release 2. **[ComfyUI Desktop](https://github.com/Comfy-Org/desktop)** From eb1c42f6498ce44aef4dbed3bb665ac98a28254d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 6 Nov 2025 17:24:28 -0800 Subject: [PATCH 549/759] Tell users they need to upload their logs in bug reports. (#10671) --- .github/ISSUE_TEMPLATE/bug-report.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 3cf2717b7..6556677e0 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -8,13 +8,15 @@ body: Before submitting a **Bug Report**, please ensure the following: - **1:** You are running the latest version of ComfyUI. - - **2:** You have looked at the existing bug reports and made sure this isn't already reported. + - **2:** You have your ComfyUI logs and relevant workflow on hand and will post them in this bug report. - **3:** You confirmed that the bug is not caused by a custom node. You can disable all custom nodes by passing - `--disable-all-custom-nodes` command line argument. + `--disable-all-custom-nodes` command line argument. If you have custom node try updating them to the latest version. - **4:** This is an actual bug in ComfyUI, not just a support question. A bug is when you can specify exact steps to replicate what went wrong and others will be able to repeat your steps and see the same issue happen. - If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first. + ## Very Important + + Please make sure that you post ALL your ComfyUI logs in the bug report. A bug report without logs will likely be ignored. - type: checkboxes id: custom-nodes-test attributes: From cf97b033ee80cf245b4592d42f89e6de67e409a4 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Fri, 7 Nov 2025 12:20:48 +1000 Subject: [PATCH 550/759] mm: guard against double pin and unpin explicitly (#10672) As commented, if you let cuda be the one to detect double pin/unpinning it actually creates an asyc GPU error. --- comfy/model_management.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/comfy/model_management.py b/comfy/model_management.py index 7a30c4bec..a13b24cea 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1103,6 +1103,12 @@ def pin_memory(tensor): if not is_device_cpu(tensor.device): return False + if tensor.is_pinned(): + #NOTE: Cuda does detect when a tensor is already pinned and would + #error below, but there are proven cases where this also queues an error + #on the GPU async. So dont trust the CUDA API and guard here + return False + size = tensor.numel() * tensor.element_size() if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY: return False @@ -1123,6 +1129,12 @@ def unpin_memory(tensor): if not is_device_cpu(tensor.device): return False + if not tensor.is_pinned(): + #NOTE: Cuda does detect when a tensor is already pinned and would + #error below, but there are proven cases where this also queues an error + #on the GPU async. So dont trust the CUDA API and guard here + return False + ptr = tensor.data_ptr() if torch.cuda.cudart().cudaHostUnregister(ptr) == 0: TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr) From a1a70362ca376cff05a0514e0ce771ab26d92fd9 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 7 Nov 2025 08:15:05 -0800 Subject: [PATCH 551/759] Only unpin tensor if it was pinned by ComfyUI (#10677) --- comfy/model_management.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index a13b24cea..7012df858 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1129,13 +1129,18 @@ def unpin_memory(tensor): if not is_device_cpu(tensor.device): return False - if not tensor.is_pinned(): - #NOTE: Cuda does detect when a tensor is already pinned and would - #error below, but there are proven cases where this also queues an error - #on the GPU async. So dont trust the CUDA API and guard here + ptr = tensor.data_ptr() + size = tensor.numel() * tensor.element_size() + + size_stored = PINNED_MEMORY.get(ptr, None) + if size_stored is None: + logging.warning("Tried to unpin tensor not pinned by ComfyUI") + return False + + if size != size_stored: + logging.warning("Size of pinned tensor changed") return False - ptr = tensor.data_ptr() if torch.cuda.cudart().cudaHostUnregister(ptr) == 0: TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr) if len(PINNED_MEMORY) == 0: From 2abd2b5c2049a9625b342bcb7decedd5d1645f66 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 8 Nov 2025 12:52:02 -0800 Subject: [PATCH 552/759] Make ScaleROPE node work on Flux. (#10686) --- comfy/ldm/flux/model.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 14f90cea5..b9d36f202 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -210,7 +210,7 @@ class Flux(nn.Module): img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels) return img - def process_img(self, x, index=0, h_offset=0, w_offset=0): + def process_img(self, x, index=0, h_offset=0, w_offset=0, transformer_options={}): bs, c, h, w = x.shape patch_size = self.patch_size x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size)) @@ -222,10 +222,22 @@ class Flux(nn.Module): h_offset = ((h_offset + (patch_size // 2)) // patch_size) w_offset = ((w_offset + (patch_size // 2)) // patch_size) - img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + steps_h = h_len + steps_w = w_len + + rope_options = transformer_options.get("rope_options", None) + if rope_options is not None: + h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0 + w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0 + + index += rope_options.get("shift_t", 0.0) + h_offset += rope_options.get("shift_y", 0.0) + w_offset += rope_options.get("shift_x", 0.0) + + img_ids = torch.zeros((steps_h, steps_w, 3), device=x.device, dtype=x.dtype) img_ids[:, :, 0] = img_ids[:, :, 1] + index - img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=x.dtype).unsqueeze(1) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=x.dtype).unsqueeze(0) return img, repeat(img_ids, "h w c -> b (h w) c", b=bs) def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs): @@ -241,7 +253,7 @@ class Flux(nn.Module): h_len = ((h_orig + (patch_size // 2)) // patch_size) w_len = ((w_orig + (patch_size // 2)) // patch_size) - img, img_ids = self.process_img(x) + img, img_ids = self.process_img(x, transformer_options=transformer_options) img_tokens = img.shape[1] if ref_latents is not None: h = 0 From e632e5de281b91dd7199636dd6d82126fbfb07d5 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 9 Nov 2025 15:06:39 -0800 Subject: [PATCH 553/759] Add logging for model unloading. (#10692) --- comfy/model_patcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 5a31a8734..17e06a869 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -909,6 +909,7 @@ class ModelPatcher: self.model.model_lowvram = True self.model.lowvram_patch_counter += patch_counter self.model.model_loaded_weight_memory -= memory_freed + logging.info("loaded partially: {:.2f} MB loaded, lowvram patches: {}".format(self.model.model_loaded_weight_memory / (1024 * 1024), self.model.lowvram_patch_counter)) return memory_freed def partially_load(self, device_to, extra_memory=0, force_patch_weights=False): From dea899f22125d38a8b48147d6cce89a2b659fdeb Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 9 Nov 2025 15:51:33 -0800 Subject: [PATCH 554/759] Unload weights if vram usage goes up between runs. (#10690) --- comfy/model_management.py | 11 +++++++++-- comfy/model_patcher.py | 20 +++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 7012df858..a4410f2ec 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -503,7 +503,11 @@ class LoadedModel: use_more_vram = lowvram_model_memory if use_more_vram == 0: use_more_vram = 1e32 - self.model_use_more_vram(use_more_vram, force_patch_weights=force_patch_weights) + if use_more_vram > 0: + self.model_use_more_vram(use_more_vram, force_patch_weights=force_patch_weights) + else: + self.model.partially_unload(self.model.offload_device, -use_more_vram, force_patch_weights=force_patch_weights) + real_model = self.model.model if is_intel_xpu() and not args.disable_ipex_optimize and 'ipex' in globals() and real_model is not None: @@ -689,7 +693,10 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu current_free_mem = get_free_memory(torch_dev) + loaded_memory lowvram_model_memory = max(128 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory())) - lowvram_model_memory = max(0.1, lowvram_model_memory - loaded_memory) + lowvram_model_memory = lowvram_model_memory - loaded_memory + + if lowvram_model_memory == 0: + lowvram_model_memory = 0.1 if vram_set_state == VRAMState.NO_VRAM: lowvram_model_memory = 0.1 diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 17e06a869..68b0a9192 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -843,7 +843,7 @@ class ModelPatcher: self.object_patches_backup.clear() - def partially_unload(self, device_to, memory_to_free=0): + def partially_unload(self, device_to, memory_to_free=0, force_patch_weights=False): with self.use_ejected(): hooks_unpatched = False memory_freed = 0 @@ -887,13 +887,19 @@ class ModelPatcher: module_mem += move_weight_functions(m, device_to) if lowvram_possible: if weight_key in self.patches: - _, set_func, convert_func = get_key_weight(self.model, weight_key) - m.weight_function.append(LowVramPatch(weight_key, self.patches, convert_func, set_func)) - patch_counter += 1 + if force_patch_weights: + self.patch_weight_to_device(weight_key) + else: + _, set_func, convert_func = get_key_weight(self.model, weight_key) + m.weight_function.append(LowVramPatch(weight_key, self.patches, convert_func, set_func)) + patch_counter += 1 if bias_key in self.patches: - _, set_func, convert_func = get_key_weight(self.model, bias_key) - m.bias_function.append(LowVramPatch(bias_key, self.patches, convert_func, set_func)) - patch_counter += 1 + if force_patch_weights: + self.patch_weight_to_device(bias_key) + else: + _, set_func, convert_func = get_key_weight(self.model, bias_key) + m.bias_function.append(LowVramPatch(bias_key, self.patches, convert_func, set_func)) + patch_counter += 1 cast_weight = True if cast_weight: From c350009236e5d172a3050c04043ea70a301378ca Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Mon, 10 Nov 2025 13:52:11 +1000 Subject: [PATCH 555/759] ops: Put weight cast on the offload stream (#10697) This needs to be on the offload stream. This reproduced a black screen with low resolution images on a slow bus when using FP8. --- comfy/ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 733bff99d..96dffa85d 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -110,9 +110,9 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of for f in s.bias_function: bias = f(bias) - weight = weight.to(dtype=dtype) - if weight_has_function: + if weight_has_function or weight.dtype != dtype: with wf_context: + weight = weight.to(dtype=dtype) for f in s.weight_function: weight = f(weight) From 5ebcab3c7d974963a89cecd37296a22fdb73bd2b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 10 Nov 2025 12:35:29 -0800 Subject: [PATCH 556/759] Update CI workflow to remove dead macOS runner. (#10704) * Update CI workflow to remove dead macOS runner. * revert * revert --- .github/workflows/test-ci.yml | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test-ci.yml b/.github/workflows/test-ci.yml index 418dca0ab..1660ec8e3 100644 --- a/.github/workflows/test-ci.yml +++ b/.github/workflows/test-ci.yml @@ -21,14 +21,15 @@ jobs: fail-fast: false matrix: # os: [macos, linux, windows] - os: [macos, linux] - python_version: ["3.9", "3.10", "3.11", "3.12"] + # os: [macos, linux] + os: [linux] + python_version: ["3.10", "3.11", "3.12"] cuda_version: ["12.1"] torch_version: ["stable"] include: - - os: macos - runner_label: [self-hosted, macOS] - flags: "--use-pytorch-cross-attention" + # - os: macos + # runner_label: [self-hosted, macOS] + # flags: "--use-pytorch-cross-attention" - os: linux runner_label: [self-hosted, Linux] flags: "" @@ -73,14 +74,15 @@ jobs: strategy: fail-fast: false matrix: - os: [macos, linux] + # os: [macos, linux] + os: [linux] python_version: ["3.11"] cuda_version: ["12.1"] torch_version: ["nightly"] include: - - os: macos - runner_label: [self-hosted, macOS] - flags: "--use-pytorch-cross-attention" + # - os: macos + # runner_label: [self-hosted, macOS] + # flags: "--use-pytorch-cross-attention" - os: linux runner_label: [self-hosted, Linux] flags: "" From 119941174704081a16a4c3f303d99f2fb1e95cde Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 11 Nov 2025 16:33:30 -0800 Subject: [PATCH 557/759] Don't pin tensor if not a torch.nn.parameter.Parameter (#10718) --- comfy/model_management.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/comfy/model_management.py b/comfy/model_management.py index a4410f2ec..d8913082a 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1107,6 +1107,9 @@ def pin_memory(tensor): if MAX_PINNED_MEMORY <= 0: return False + if type(tensor) is not torch.nn.parameter.Parameter: + return False + if not is_device_cpu(tensor.device): return False @@ -1116,6 +1119,9 @@ def pin_memory(tensor): #on the GPU async. So dont trust the CUDA API and guard here return False + if not tensor.is_contiguous(): + return False + size = tensor.numel() * tensor.element_size() if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY: return False From e1d85e7577d8f6355bd4cb3449bcb0a7e5f80cb8 Mon Sep 17 00:00:00 2001 From: Qiacheng Li Date: Wed, 12 Nov 2025 12:21:05 -0800 Subject: [PATCH 558/759] Update README.md for Intel Arc GPU installation, remove IPEX (#10729) IPEX is no longer needed for Intel Arc GPUs. Removing instruction to setup ipex. --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index 8142f595b..9e28803a2 100644 --- a/README.md +++ b/README.md @@ -242,7 +242,7 @@ RDNA 4 (RX 9000 series): ### Intel GPUs (Windows and Linux) -(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html) +Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html) 1. To install PyTorch xpu, use the following command: @@ -252,10 +252,6 @@ This is the command to install the Pytorch xpu nightly which might have some per ```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu``` -(Option 2) Alternatively, Intel GPUs supported by Intel Extension for PyTorch (IPEX) can leverage IPEX for improved performance. - -1. visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information. - ### NVIDIA Nvidia users should install stable pytorch using this command: From 18e7d6dba5f1012d4cf09e8f777dc85d56ff25c0 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Thu, 13 Nov 2025 07:19:53 +1000 Subject: [PATCH 559/759] mm/mp: always unload re-used but modified models (#10724) The partial unloader path in model re-use flow skips straight to the actual unload without any check of the patching UUID. This means that if you do an upscale flow with a model patch on an existing model, it will not apply your patchings. Fix by delaying the partial_unload until after the uuid checks. This is done by making partial_unload a model of partial_load where extra_mem is -ve. --- comfy/model_management.py | 5 +---- comfy/model_patcher.py | 3 +++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index d8913082a..a21df54b3 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -503,10 +503,7 @@ class LoadedModel: use_more_vram = lowvram_model_memory if use_more_vram == 0: use_more_vram = 1e32 - if use_more_vram > 0: - self.model_use_more_vram(use_more_vram, force_patch_weights=force_patch_weights) - else: - self.model.partially_unload(self.model.offload_device, -use_more_vram, force_patch_weights=force_patch_weights) + self.model_use_more_vram(use_more_vram, force_patch_weights=force_patch_weights) real_model = self.model.model diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 68b0a9192..cf1b0d441 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -928,6 +928,9 @@ class ModelPatcher: extra_memory += (used - self.model.model_loaded_weight_memory) self.patch_model(load_weights=False) + if extra_memory < 0 and not unpatch_weights: + self.partially_unload(self.offload_device, -extra_memory, force_patch_weights=force_patch_weights) + return 0 full_load = False if self.model.model_lowvram == False and self.model.model_loaded_weight_memory > 0: self.apply_hooks(self.forced_hooks, force_apply=True) From 1c7eaeca1013e4315f36e0d4d274faa106001121 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Thu, 13 Nov 2025 07:20:53 +1000 Subject: [PATCH 560/759] qwen: reduce VRAM usage (#10725) Clean up a bunch of stacked and no-longer-needed tensors on the QWEN VRAM peak (currently FFN). With this I go from OOMing at B=37x1328x1328 to being able to succesfully run B=47 (RTX5090). --- comfy/ldm/qwen_image/model.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index e5d0d17c1..427ea19c1 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -236,10 +236,10 @@ class QwenImageTransformerBlock(nn.Module): img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1) txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1) - img_normed = self.img_norm1(hidden_states) - img_modulated, img_gate1 = self._modulate(img_normed, img_mod1) - txt_normed = self.txt_norm1(encoder_hidden_states) - txt_modulated, txt_gate1 = self._modulate(txt_normed, txt_mod1) + img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1) + del img_mod1 + txt_modulated, txt_gate1 = self._modulate(self.txt_norm1(encoder_hidden_states), txt_mod1) + del txt_mod1 img_attn_output, txt_attn_output = self.attn( hidden_states=img_modulated, @@ -248,16 +248,20 @@ class QwenImageTransformerBlock(nn.Module): image_rotary_emb=image_rotary_emb, transformer_options=transformer_options, ) + del img_modulated + del txt_modulated hidden_states = hidden_states + img_gate1 * img_attn_output encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output + del img_attn_output + del txt_attn_output + del img_gate1 + del txt_gate1 - img_normed2 = self.img_norm2(hidden_states) - img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2) + img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2) hidden_states = torch.addcmul(hidden_states, img_gate2, self.img_mlp(img_modulated2)) - txt_normed2 = self.txt_norm2(encoder_hidden_states) - txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2) + txt_modulated2, txt_gate2 = self._modulate(self.txt_norm2(encoder_hidden_states), txt_mod2) encoder_hidden_states = torch.addcmul(encoder_hidden_states, txt_gate2, self.txt_mlp(txt_modulated2)) return encoder_hidden_states, hidden_states From 8b0b93df51d04f08eb779cb84dc331fa18b43ae8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 12 Nov 2025 14:04:41 -0800 Subject: [PATCH 561/759] Update Python 3.14 compatibility notes in README (#10730) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9e28803a2..f51807ad5 100644 --- a/README.md +++ b/README.md @@ -200,7 +200,7 @@ comfy install ## Manual Install (Windows, Linux) -Python 3.14 will work if you comment out the `kornia` dependency in the requirements.txt file (breaks the canny node) but it is not recommended. +Python 3.14 works but you may encounter issues with the torch compile node. The free threaded variant is still missing some dependencies. Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12 From 3b3ef9a77ac03ed516a45063f9736f33085cecca Mon Sep 17 00:00:00 2001 From: contentis Date: Thu, 13 Nov 2025 00:26:52 +0100 Subject: [PATCH 562/759] Quantized Ops fixes (#10715) * offload support, bug fixes, remove mixins * add readme --- QUANTIZATION.md | 168 +++++++++++++++++++++++++++++++++++++++++++++ comfy/ops.py | 37 ++++------ comfy/quant_ops.py | 39 ++++++++++- 3 files changed, 219 insertions(+), 25 deletions(-) create mode 100644 QUANTIZATION.md diff --git a/QUANTIZATION.md b/QUANTIZATION.md new file mode 100644 index 000000000..1693e13f3 --- /dev/null +++ b/QUANTIZATION.md @@ -0,0 +1,168 @@ +# The Comfy guide to Quantization + + +## How does quantization work? + +Quantization aims to map a high-precision value x_f to a lower precision format with minimal loss in accuracy. These smaller formats then serve to reduce the models memory footprint and increase throughput by using specialized hardware. + +When simply converting a value from FP16 to FP8 using the round-nearest method we might hit two issues: +- The dynamic range of FP16 (-65,504, 65,504) far exceeds FP8 formats like E4M3 (-448, 448) or E5M2 (-57,344, 57,344), potentially resulting in clipped values +- The original values are concentrated in a small range (e.g. -1,1) leaving many FP8-bits "unused" + +By using a scaling factor, we aim to map these values into the quantized-dtype range, making use of the full spectrum. One of the easiest approaches, and common, is using per-tensor absolute-maximum scaling. + +``` +absmax = max(abs(tensor)) +scale = amax / max_dynamic_range_low_precision + +# Quantization +tensor_q = (tensor / scale).to(low_precision_dtype) + +# De-Quantization +tensor_dq = tensor_q.to(fp16) * scale + +tensor_dq ~ tensor +``` + +Given that additional information (scaling factor) is needed to "interpret" the quantized values, we describe those as derived datatypes. + + +## Quantization in Comfy + +``` +QuantizedTensor (torch.Tensor subclass) + ↓ __torch_dispatch__ +Two-Level Registry (generic + layout handlers) + ↓ +MixedPrecisionOps + Metadata Detection +``` + +### Representation + +To represent these derived datatypes, ComfyUI uses a subclass of torch.Tensor to implements these using the `QuantizedTensor` class found in `comfy/quant_ops.py` + +A `Layout` class defines how a specific quantization format behaves: +- Required parameters +- Quantize method +- De-Quantize method + +```python +from comfy.quant_ops import QuantizedLayout + +class MyLayout(QuantizedLayout): + @classmethod + def quantize(cls, tensor, **kwargs): + # Convert to quantized format + qdata = ... + params = {'scale': ..., 'orig_dtype': tensor.dtype} + return qdata, params + + @staticmethod + def dequantize(qdata, scale, orig_dtype, **kwargs): + return qdata.to(orig_dtype) * scale +``` + +To then run operations using these QuantizedTensors we use two registry systems to define supported operations. +The first is a **generic registry** that handles operations common to all quantized formats (e.g., `.to()`, `.clone()`, `.reshape()`). + +The second registry is layout-specific and allows to implement fast-paths like nn.Linear. +```python +from comfy.quant_ops import register_layout_op + +@register_layout_op(torch.ops.aten.linear.default, MyLayout) +def my_linear(func, args, kwargs): + # Extract tensors, call optimized kernel + ... +``` +When `torch.nn.functional.linear()` is called with QuantizedTensor arguments, `__torch_dispatch__` automatically routes to the registered implementation. +For any unsupported operation, QuantizedTensor will fallback to call `dequantize` and dispatch using the high-precision implementation. + + +### Mixed Precision + +The `MixedPrecisionOps` class (lines 542-648 in `comfy/ops.py`) enables per-layer quantization decisions, allowing different layers in a model to use different precisions. This is activated when a model config contains a `layer_quant_config` dictionary that specifies which layers should be quantized and how. + +**Architecture:** + +```python +class MixedPrecisionOps(disable_weight_init): + _layer_quant_config = {} # Maps layer names to quantization configs + _compute_dtype = torch.bfloat16 # Default compute / dequantize precision +``` + +**Key mechanism:** + +The custom `Linear._load_from_state_dict()` method inspects each layer during model loading: +- If the layer name is **not** in `_layer_quant_config`: load weight as regular tensor in `_compute_dtype` +- If the layer name **is** in `_layer_quant_config`: + - Load weight as `QuantizedTensor` with the specified layout (e.g., `TensorCoreFP8Layout`) + - Load associated quantization parameters (scales, block_size, etc.) + +**Why it's needed:** + +Not all layers tolerate quantization equally. Sensitive operations like final projections can be kept in higher precision, while compute-heavy matmuls are quantized. This provides most of the performance benefits while maintaining quality. + +The system is selected in `pick_operations()` when `model_config.layer_quant_config` is present, making it the highest-priority operation mode. + + +## Checkpoint Format + +Quantized checkpoints are stored as standard safetensors files with quantized weight tensors and associated scaling parameters, plus a `_quantization_metadata` JSON entry describing the quantization scheme. + +The quantized checkpoint will contain the same layers as the original checkpoint but: +- The weights are stored as quantized values, sometimes using a different storage datatype. E.g. uint8 container for fp8. +- For each quantized weight a number of additional scaling parameters are stored alongside depending on the recipe. +- We store a metadata.json in the metadata of the final safetensor containing the `_quantization_metadata` describing which layers are quantized and what layout has been used. + +### Scaling Parameters details +We define 4 possible scaling parameters that should cover most recipes in the near-future: +- **weight_scale**: quantization scalers for the weights +- **weight_scale_2**: global scalers in the context of double scaling +- **pre_quant_scale**: scalers used for smoothing salient weights +- **input_scale**: quantization scalers for the activations + +| Format | Storage dtype | weight_scale | weight_scale_2 | pre_quant_scale | input_scale | +|--------|---------------|--------------|----------------|-----------------|-------------| +| float8_e4m3fn | float32 | float32 (scalar) | - | - | float32 (scalar) | + +You can find the defined formats in `comfy/quant_ops.py` (QUANT_ALGOS). + +### Quantization Metadata + +The metadata stored alongside the checkpoint contains: +- **format_version**: String to define a version of the standard +- **layers**: A dictionary mapping layer names to their quantization format. The format string maps to the definitions found in `QUANT_ALGOS`. + +Example: +```json +{ + "_quantization_metadata": { + "format_version": "1.0", + "layers": { + "model.layers.0.mlp.up_proj": "float8_e4m3fn", + "model.layers.0.mlp.down_proj": "float8_e4m3fn", + "model.layers.1.mlp.up_proj": "float8_e4m3fn" + } + } +} +``` + + +## Creating Quantized Checkpoints + +To create compatible checkpoints, use any quantization tool provided the output follows the checkpoint format described above and uses a layout defined in `QUANT_ALGOS`. + +### Weight Quantization + +Weight quantization is straightforward - compute the scaling factor directly from the weight tensor using the absolute maximum method described earlier. Each layer's weights are quantized independently and stored with their corresponding `weight_scale` parameter. + +### Calibration (for Activation Quantization) + +Activation quantization (e.g., for FP8 Tensor Core operations) requires `input_scale` parameters that cannot be determined from static weights alone. Since activation values depend on actual inputs, we use **post-training calibration (PTQ)**: + +1. **Collect statistics**: Run inference on N representative samples +2. **Track activations**: Record the absolute maximum (`amax`) of inputs to each quantized layer +3. **Compute scales**: Derive `input_scale` from collected statistics +4. **Store in checkpoint**: Save `input_scale` parameters alongside weights + +The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters. \ No newline at end of file diff --git a/comfy/ops.py b/comfy/ops.py index 96dffa85d..2a90a5ba2 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -77,7 +77,10 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of # will add async-offload support to your cast and improve performance. if input is not None: if dtype is None: - dtype = input.dtype + if isinstance(input, QuantizedTensor): + dtype = input._layout_params["orig_dtype"] + else: + dtype = input.dtype if bias_dtype is None: bias_dtype = dtype if device is None: @@ -534,18 +537,7 @@ if CUBLAS_IS_AVAILABLE: # ============================================================================== # Mixed Precision Operations # ============================================================================== -from .quant_ops import QuantizedTensor - -QUANT_FORMAT_MIXINS = { - "float8_e4m3fn": { - "dtype": torch.float8_e4m3fn, - "layout_type": "TensorCoreFP8Layout", - "parameters": { - "weight_scale": torch.nn.Parameter(torch.zeros((), dtype=torch.float32), requires_grad=False), - "input_scale": torch.nn.Parameter(torch.zeros((), dtype=torch.float32), requires_grad=False), - } - } -} +from .quant_ops import QuantizedTensor, QUANT_ALGOS class MixedPrecisionOps(disable_weight_init): _layer_quant_config = {} @@ -596,23 +588,24 @@ class MixedPrecisionOps(disable_weight_init): if quant_format is None: raise ValueError(f"Unknown quantization format for layer {layer_name}") - mixin = QUANT_FORMAT_MIXINS[quant_format] - self.layout_type = mixin["layout_type"] + qconfig = QUANT_ALGOS[quant_format] + self.layout_type = qconfig["comfy_tensor_layout"] - scale_key = f"{prefix}weight_scale" + weight_scale_key = f"{prefix}weight_scale" layout_params = { - 'scale': state_dict.pop(scale_key, None), - 'orig_dtype': MixedPrecisionOps._compute_dtype + 'scale': state_dict.pop(weight_scale_key, None), + 'orig_dtype': MixedPrecisionOps._compute_dtype, + 'block_size': qconfig.get("group_size", None), } if layout_params['scale'] is not None: - manually_loaded_keys.append(scale_key) + manually_loaded_keys.append(weight_scale_key) self.weight = torch.nn.Parameter( - QuantizedTensor(weight.to(device=device, dtype=mixin["dtype"]), self.layout_type, layout_params), + QuantizedTensor(weight.to(device=device), self.layout_type, layout_params), requires_grad=False ) - for param_name, param_value in mixin["parameters"].items(): + for param_name in qconfig["parameters"]: param_key = f"{prefix}{param_name}" _v = state_dict.pop(param_key, None) if _v is None: @@ -643,7 +636,7 @@ class MixedPrecisionOps(disable_weight_init): if (getattr(self, 'layout_type', None) is not None and getattr(self, 'input_scale', None) is not None and not isinstance(input, QuantizedTensor)): - input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, fp8_dtype=self.weight.dtype) + input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, dtype=self.weight.dtype) return self._forward(input, self.weight, self.bias) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index c56e32a73..1d058bece 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -74,6 +74,12 @@ def _copy_layout_params(params): new_params[k] = v return new_params +def _copy_layout_params_inplace(src, dst, non_blocking=False): + for k, v in src.items(): + if isinstance(v, torch.Tensor): + dst[k].copy_(v, non_blocking=non_blocking) + else: + dst[k] = v class QuantizedLayout: """ @@ -318,13 +324,13 @@ def generic_to_dtype_layout(func, args, kwargs): def generic_copy_(func, args, kwargs): qt_dest = args[0] src = args[1] - + non_blocking = args[2] if len(args) > 2 else False if isinstance(qt_dest, QuantizedTensor): if isinstance(src, QuantizedTensor): # Copy from another quantized tensor - qt_dest._qdata.copy_(src._qdata) + qt_dest._qdata.copy_(src._qdata, non_blocking=non_blocking) qt_dest._layout_type = src._layout_type - qt_dest._layout_params = _copy_layout_params(src._layout_params) + _copy_layout_params_inplace(src._layout_params, qt_dest._layout_params, non_blocking=non_blocking) else: # Copy from regular tensor - just copy raw data qt_dest._qdata.copy_(src) @@ -336,6 +342,26 @@ def generic_copy_(func, args, kwargs): def generic_has_compatible_shallow_copy_type(func, args, kwargs): return True + +@register_generic_util(torch.ops.aten.empty_like.default) +def generic_empty_like(func, args, kwargs): + """Empty_like operation - creates an empty tensor with the same quantized structure.""" + qt = args[0] + if isinstance(qt, QuantizedTensor): + # Create empty tensor with same shape and dtype as the quantized data + hp_dtype = kwargs.pop('dtype', qt._layout_params["orig_dtype"]) + new_qdata = torch.empty_like(qt._qdata, **kwargs) + + # Handle device transfer for layout params + target_device = kwargs.get('device', new_qdata.device) + new_params = _move_layout_params_to_device(qt._layout_params, target_device) + + # Update orig_dtype if dtype is specified + new_params['orig_dtype'] = hp_dtype + + return QuantizedTensor(new_qdata, qt._layout_type, new_params) + return func(*args, **kwargs) + # ============================================================================== # FP8 Layout + Operation Handlers # ============================================================================== @@ -378,6 +404,13 @@ class TensorCoreFP8Layout(QuantizedLayout): def get_plain_tensors(cls, qtensor): return qtensor._qdata, qtensor._layout_params['scale'] +QUANT_ALGOS = { + "float8_e4m3fn": { + "storage_t": torch.float8_e4m3fn, + "parameters": {"weight_scale", "input_scale"}, + "comfy_tensor_layout": "TensorCoreFP8Layout", + }, +} LAYOUTS = { "TensorCoreFP8Layout": TensorCoreFP8Layout, From f91078b1ffa484c424f78814f54de4d5846e4daa Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 13 Nov 2025 20:05:26 +0200 Subject: [PATCH 563/759] add PR template for API-Nodes (#10736) --- .github/PULL_REQUEST_TEMPLATE/api-node.md | 21 ++++++++ .github/workflows/api-node-template.yml | 58 +++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 .github/PULL_REQUEST_TEMPLATE/api-node.md create mode 100644 .github/workflows/api-node-template.yml diff --git a/.github/PULL_REQUEST_TEMPLATE/api-node.md b/.github/PULL_REQUEST_TEMPLATE/api-node.md new file mode 100644 index 000000000..f62744878 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/api-node.md @@ -0,0 +1,21 @@ + + +## API Node PR Checklist + +### Scope +- [ ] **Is API Node Change** + +### Pricing & Billing +- [ ] **Need pricing update** +- [ ] **No pricing update** + +If **Need pricing update**: +- [ ] Metronome rate cards updated +- [ ] Auto‑billing tests updated and passing + +### QA +- [ ] **QA done** +- [ ] **QA not required** + +### Comms +- [ ] Informed **@Kosinkadink** diff --git a/.github/workflows/api-node-template.yml b/.github/workflows/api-node-template.yml new file mode 100644 index 000000000..0775f9979 --- /dev/null +++ b/.github/workflows/api-node-template.yml @@ -0,0 +1,58 @@ +name: Append API Node PR template + +on: + pull_request_target: + types: [opened, reopened, synchronize, edited, ready_for_review] + paths: + - 'comfy_api_nodes/**' # only run if these files changed + +permissions: + contents: read + pull-requests: write + +jobs: + inject: + runs-on: ubuntu-latest + steps: + - name: Ensure template exists and append to PR body + uses: actions/github-script@v7 + with: + script: | + const { owner, repo } = context.repo; + const number = context.payload.pull_request.number; + const templatePath = '.github/PULL_REQUEST_TEMPLATE/api-node.md'; + const marker = ''; + + const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: number }); + + let templateText; + try { + const res = await github.rest.repos.getContent({ + owner, + repo, + path: templatePath, + ref: pr.base.ref + }); + const buf = Buffer.from(res.data.content, res.data.encoding || 'base64'); + templateText = buf.toString('utf8'); + } catch (e) { + core.setFailed(`Required PR template not found at "${templatePath}" on ${pr.base.ref}. Please add it to the repo.`); + return; + } + + // Enforce the presence of the marker inside the template (for idempotence) + if (!templateText.includes(marker)) { + core.setFailed(`Template at "${templatePath}" does not contain the required marker:\n${marker}\nAdd it so we can detect duplicates safely.`); + return; + } + + // If the PR already contains the marker, do not append again. + const body = pr.body || ''; + if (body.includes(marker)) { + core.info('Template already present in PR body; nothing to inject.'); + return; + } + + const newBody = (body ? body + '\n\n' : '') + templateText + '\n'; + await github.rest.pulls.update({ owner, repo, pull_number: number, body: newBody }); + core.notice('API Node template appended to PR description.'); From 2fde9597f4b02c5f06c1a5ceb3ca2fa6d74966ec Mon Sep 17 00:00:00 2001 From: ric-yu Date: Thu, 13 Nov 2025 15:11:52 -0800 Subject: [PATCH 564/759] feat: add create_time dict to prompt field in /history and /queue (#10741) --- server.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server.py b/server.py index 5d773b10a..d059d3dc9 100644 --- a/server.py +++ b/server.py @@ -2,6 +2,7 @@ import os import sys import asyncio import traceback +import time import nodes import folder_paths @@ -733,6 +734,7 @@ class PromptServer(): for sensitive_val in execution.SENSITIVE_EXTRA_DATA_KEYS: if sensitive_val in extra_data: sensitive[sensitive_val] = extra_data.pop(sensitive_val) + extra_data["create_time"] = int(time.time() * 1000) # timestamp in milliseconds self.prompt_queue.put((number, prompt_id, prompt, extra_data, outputs_to_execute, sensitive)) response = {"prompt_id": prompt_id, "number": number, "node_errors": valid[3]} return web.json_response(response) From 94c298f9625b0fd9af8ea07a73075fdefe0d9e57 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Fri, 14 Nov 2025 10:02:03 +1000 Subject: [PATCH 565/759] flux: reduce VRAM usage (#10737) Cleanup a bunch of stack tensors on Flux. This take me from B=19 to B=22 for 1600x1600 on RTX5090. --- comfy/ldm/flux/layers.py | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py index a3eab0470..f4bf56e01 100644 --- a/comfy/ldm/flux/layers.py +++ b/comfy/ldm/flux/layers.py @@ -167,39 +167,55 @@ class DoubleStreamBlock(nn.Module): img_modulated = self.img_norm1(img) img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img) img_qkv = self.img_attn.qkv(img_modulated) + del img_modulated img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + del img_qkv img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) # prepare txt for attention txt_modulated = self.txt_norm1(txt) txt_modulated = apply_mod(txt_modulated, (1 + txt_mod1.scale), txt_mod1.shift, modulation_dims_txt) txt_qkv = self.txt_attn.qkv(txt_modulated) + del txt_modulated txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + del txt_qkv txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) if self.flipped_img_txt: + q = torch.cat((img_q, txt_q), dim=2) + del img_q, txt_q + k = torch.cat((img_k, txt_k), dim=2) + del img_k, txt_k + v = torch.cat((img_v, txt_v), dim=2) + del img_v, txt_v # run actual attention - attn = attention(torch.cat((img_q, txt_q), dim=2), - torch.cat((img_k, txt_k), dim=2), - torch.cat((img_v, txt_v), dim=2), + attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options) + del q, k, v img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:] else: + q = torch.cat((txt_q, img_q), dim=2) + del txt_q, img_q + k = torch.cat((txt_k, img_k), dim=2) + del txt_k, img_k + v = torch.cat((txt_v, img_v), dim=2) + del txt_v, img_v # run actual attention - attn = attention(torch.cat((txt_q, img_q), dim=2), - torch.cat((txt_k, img_k), dim=2), - torch.cat((txt_v, img_v), dim=2), + attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options) + del q, k, v txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:] # calculate the img bloks img += apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img) + del img_attn img += apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img) # calculate the txt bloks txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt) + del txt_attn txt += apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt) if txt.dtype == torch.float16: @@ -249,12 +265,15 @@ class SingleStreamBlock(nn.Module): qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + del qkv q, k = self.norm(q, k, v) # compute attention attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options) + del q, k, v # compute activation in mlp stream, cat again and run second linear layer - output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) + mlp = self.mlp_act(mlp) + output = self.linear2(torch.cat((attn, mlp), 2)) x += apply_mod(output, mod.gate, None, modulation_dims) if x.dtype == torch.float16: x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504) From 1ef328c007a419c2c429df0f80532cc11579dc97 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 13 Nov 2025 18:32:39 -0800 Subject: [PATCH 566/759] Better instructions for the portable. (#10743) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f51807ad5..cd8273b0d 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ There is a portable standalone build for Windows that should work for running on ### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z) -Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints +Simply download, extract with [7-Zip](https://7-zip.org) or with the windows explorer on recent windows versions and run. For smaller models you normally only need to put the checkpoints (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints but many of the larger models have multiple files. Make sure to follow the instructions to know which subfolder to put them in ComfyUI\models\ If you have trouble extracting it, right click the file -> properties -> unblock From f60923590c3f2fd05e166e2ec57968aaf7007dd0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 13 Nov 2025 22:28:05 -0800 Subject: [PATCH 567/759] Use same code for chroma and flux blocks so that optimizations are shared. (#10746) --- comfy/ldm/chroma/layers.py | 121 ----------------------------- comfy/ldm/chroma/model.py | 7 +- comfy/ldm/chroma_radiance/model.py | 7 +- comfy/ldm/flux/layers.py | 31 ++++++-- 4 files changed, 31 insertions(+), 135 deletions(-) diff --git a/comfy/ldm/chroma/layers.py b/comfy/ldm/chroma/layers.py index fc7110cce..9f4ad5bd2 100644 --- a/comfy/ldm/chroma/layers.py +++ b/comfy/ldm/chroma/layers.py @@ -1,12 +1,9 @@ import torch from torch import Tensor, nn -from comfy.ldm.flux.math import attention from comfy.ldm.flux.layers import ( MLPEmbedder, RMSNorm, - QKNorm, - SelfAttention, ModulationOut, ) @@ -48,124 +45,6 @@ class Approximator(nn.Module): return x -class DoubleStreamBlock(nn.Module): - def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None): - super().__init__() - - mlp_hidden_dim = int(hidden_size * mlp_ratio) - self.num_heads = num_heads - self.hidden_size = hidden_size - self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations) - - self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.img_mlp = nn.Sequential( - operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), - nn.GELU(approximate="tanh"), - operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), - ) - - self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations) - - self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.txt_mlp = nn.Sequential( - operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), - nn.GELU(approximate="tanh"), - operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), - ) - self.flipped_img_txt = flipped_img_txt - - def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}): - (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec - - # prepare image for attention - img_modulated = torch.addcmul(img_mod1.shift, 1 + img_mod1.scale, self.img_norm1(img)) - img_qkv = self.img_attn.qkv(img_modulated) - img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) - img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) - - # prepare txt for attention - txt_modulated = torch.addcmul(txt_mod1.shift, 1 + txt_mod1.scale, self.txt_norm1(txt)) - txt_qkv = self.txt_attn.qkv(txt_modulated) - txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) - txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) - - # run actual attention - attn = attention(torch.cat((txt_q, img_q), dim=2), - torch.cat((txt_k, img_k), dim=2), - torch.cat((txt_v, img_v), dim=2), - pe=pe, mask=attn_mask, transformer_options=transformer_options) - - txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] - - # calculate the img bloks - img.addcmul_(img_mod1.gate, self.img_attn.proj(img_attn)) - img.addcmul_(img_mod2.gate, self.img_mlp(torch.addcmul(img_mod2.shift, 1 + img_mod2.scale, self.img_norm2(img)))) - - # calculate the txt bloks - txt.addcmul_(txt_mod1.gate, self.txt_attn.proj(txt_attn)) - txt.addcmul_(txt_mod2.gate, self.txt_mlp(torch.addcmul(txt_mod2.shift, 1 + txt_mod2.scale, self.txt_norm2(txt)))) - - if txt.dtype == torch.float16: - txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504) - - return img, txt - - -class SingleStreamBlock(nn.Module): - """ - A DiT block with parallel linear layers as described in - https://arxiv.org/abs/2302.05442 and adapted modulation interface. - """ - - def __init__( - self, - hidden_size: int, - num_heads: int, - mlp_ratio: float = 4.0, - qk_scale: float = None, - dtype=None, - device=None, - operations=None - ): - super().__init__() - self.hidden_dim = hidden_size - self.num_heads = num_heads - head_dim = hidden_size // num_heads - self.scale = qk_scale or head_dim**-0.5 - - self.mlp_hidden_dim = int(hidden_size * mlp_ratio) - # qkv and mlp_in - self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device) - # proj and mlp_out - self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device) - - self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations) - - self.hidden_size = hidden_size - self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - - self.mlp_act = nn.GELU(approximate="tanh") - - def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}) -> Tensor: - mod = vec - x_mod = torch.addcmul(mod.shift, 1 + mod.scale, self.pre_norm(x)) - qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) - - q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) - q, k = self.norm(q, k, v) - - # compute attention - attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options) - # compute activation in mlp stream, cat again and run second linear layer - output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) - x.addcmul_(mod.gate, output) - if x.dtype == torch.float16: - x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504) - return x - - class LastLayer(nn.Module): def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None): super().__init__() diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py index ad1c523fe..67bf70eb1 100644 --- a/comfy/ldm/chroma/model.py +++ b/comfy/ldm/chroma/model.py @@ -11,12 +11,12 @@ import comfy.ldm.common_dit from comfy.ldm.flux.layers import ( EmbedND, timestep_embedding, + DoubleStreamBlock, + SingleStreamBlock, ) from .layers import ( - DoubleStreamBlock, LastLayer, - SingleStreamBlock, Approximator, ChromaModulationOut, ) @@ -90,6 +90,7 @@ class Chroma(nn.Module): self.num_heads, mlp_ratio=params.mlp_ratio, qkv_bias=params.qkv_bias, + modulation=False, dtype=dtype, device=device, operations=operations ) for _ in range(params.depth) @@ -98,7 +99,7 @@ class Chroma(nn.Module): self.single_blocks = nn.ModuleList( [ - SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations) + SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=False, dtype=dtype, device=device, operations=operations) for _ in range(params.depth_single_blocks) ] ) diff --git a/comfy/ldm/chroma_radiance/model.py b/comfy/ldm/chroma_radiance/model.py index 7d7be80f5..e643b4414 100644 --- a/comfy/ldm/chroma_radiance/model.py +++ b/comfy/ldm/chroma_radiance/model.py @@ -10,12 +10,10 @@ from torch import Tensor, nn from einops import repeat import comfy.ldm.common_dit -from comfy.ldm.flux.layers import EmbedND +from comfy.ldm.flux.layers import EmbedND, DoubleStreamBlock, SingleStreamBlock from comfy.ldm.chroma.model import Chroma, ChromaParams from comfy.ldm.chroma.layers import ( - DoubleStreamBlock, - SingleStreamBlock, Approximator, ) from .layers import ( @@ -89,7 +87,6 @@ class ChromaRadiance(Chroma): dtype=dtype, device=device, operations=operations ) - self.double_blocks = nn.ModuleList( [ DoubleStreamBlock( @@ -97,6 +94,7 @@ class ChromaRadiance(Chroma): self.num_heads, mlp_ratio=params.mlp_ratio, qkv_bias=params.qkv_bias, + modulation=False, dtype=dtype, device=device, operations=operations ) for _ in range(params.depth) @@ -109,6 +107,7 @@ class ChromaRadiance(Chroma): self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, + modulation=False, dtype=dtype, device=device, operations=operations, ) for _ in range(params.depth_single_blocks) diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py index f4bf56e01..23150a712 100644 --- a/comfy/ldm/flux/layers.py +++ b/comfy/ldm/flux/layers.py @@ -130,13 +130,17 @@ def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None): class DoubleStreamBlock(nn.Module): - def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None): + def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, dtype=None, device=None, operations=None): super().__init__() mlp_hidden_dim = int(hidden_size * mlp_ratio) self.num_heads = num_heads self.hidden_size = hidden_size - self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations) + self.modulation = modulation + + if self.modulation: + self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations) + self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations) @@ -147,7 +151,9 @@ class DoubleStreamBlock(nn.Module): operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), ) - self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations) + if self.modulation: + self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations) + self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations) @@ -160,8 +166,11 @@ class DoubleStreamBlock(nn.Module): self.flipped_img_txt = flipped_img_txt def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}): - img_mod1, img_mod2 = self.img_mod(vec) - txt_mod1, txt_mod2 = self.txt_mod(vec) + if self.modulation: + img_mod1, img_mod2 = self.img_mod(vec) + txt_mod1, txt_mod2 = self.txt_mod(vec) + else: + (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec # prepare image for attention img_modulated = self.img_norm1(img) @@ -236,6 +245,7 @@ class SingleStreamBlock(nn.Module): num_heads: int, mlp_ratio: float = 4.0, qk_scale: float = None, + modulation=True, dtype=None, device=None, operations=None @@ -258,10 +268,17 @@ class SingleStreamBlock(nn.Module): self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) self.mlp_act = nn.GELU(approximate="tanh") - self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations) + if modulation: + self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations) + else: + self.modulation = None def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None, transformer_options={}) -> Tensor: - mod, _ = self.modulation(vec) + if self.modulation: + mod, _ = self.modulation(vec) + else: + mod = vec + qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) From 443056c401c53953bb8eee6da71b9ad29afe2581 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 14 Nov 2025 00:26:05 -0800 Subject: [PATCH 568/759] Fix custom nodes import error. (#10747) This should fix the import errors but will break if the custom nodes actually try to use the class. --- comfy/ldm/chroma/layers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy/ldm/chroma/layers.py b/comfy/ldm/chroma/layers.py index 9f4ad5bd2..2d5684348 100644 --- a/comfy/ldm/chroma/layers.py +++ b/comfy/ldm/chroma/layers.py @@ -7,6 +7,9 @@ from comfy.ldm.flux.layers import ( ModulationOut, ) +# TODO: remove this in a few months +SingleStreamBlock = None +DoubleStreamBlock = None class ChromaModulationOut(ModulationOut): From bd01d9f7fd241a45bd08b60dfedbe78577383cc4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 15 Nov 2025 03:54:40 -0800 Subject: [PATCH 569/759] Add left padding support to tokenizers. (#10753) --- comfy/sd1_clip.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index f8a7c2a1b..3066de2d7 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -460,7 +460,7 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No return embed_out class SDTokenizer: - def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data={}, tokenizer_args={}): + def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, pad_left=False, tokenizer_data={}, tokenizer_args={}): if tokenizer_path is None: tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer") self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args) @@ -468,6 +468,7 @@ class SDTokenizer: self.min_length = tokenizer_data.get("{}_min_length".format(embedding_key), min_length) self.end_token = None self.min_padding = min_padding + self.pad_left = pad_left empty = self.tokenizer('')["input_ids"] self.tokenizer_adds_end_token = has_end_token @@ -522,6 +523,12 @@ class SDTokenizer: return (embed, "{} {}".format(embedding_name[len(stripped):], leftover)) return (embed, leftover) + def pad_tokens(self, tokens, amount): + if self.pad_left: + for i in range(amount): + tokens.insert(0, (self.pad_token, 1.0, 0)) + else: + tokens.extend([(self.pad_token, 1.0, 0)] * amount) def tokenize_with_weights(self, text:str, return_word_ids=False, tokenizer_options={}, **kwargs): ''' @@ -600,7 +607,7 @@ class SDTokenizer: if self.end_token is not None: batch.append((self.end_token, 1.0, 0)) if self.pad_to_max_length: - batch.extend([(self.pad_token, 1.0, 0)] * (remaining_length)) + self.pad_tokens(batch, remaining_length) #start new batch batch = [] if self.start_token is not None: @@ -614,11 +621,11 @@ class SDTokenizer: if self.end_token is not None: batch.append((self.end_token, 1.0, 0)) if min_padding is not None: - batch.extend([(self.pad_token, 1.0, 0)] * min_padding) + self.pad_tokens(batch, min_padding) if self.pad_to_max_length and len(batch) < self.max_length: - batch.extend([(self.pad_token, 1.0, 0)] * (self.max_length - len(batch))) + self.pad_tokens(batch, self.max_length - len(batch)) if min_length is not None and len(batch) < min_length: - batch.extend([(self.pad_token, 1.0, 0)] * (min_length - len(batch))) + self.pad_tokens(batch, min_length - len(batch)) if not return_word_ids: batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens] From 9a0238256873711bd38ce0e0b1d15a617a1ee454 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 15 Nov 2025 21:18:49 +0200 Subject: [PATCH 570/759] chore(api-nodes): mark OpenAIDalle2 and OpenAIDalle3 nodes as deprecated (#10757) --- comfy_api_nodes/nodes_openai.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index acf35d276..e08bec08c 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -169,6 +169,7 @@ class OpenAIDalle2(IO.ComfyNode): IO.Hidden.unique_id, ], is_api_node=True, + is_deprecated=True, ) @classmethod @@ -299,6 +300,7 @@ class OpenAIDalle3(IO.ComfyNode): IO.Hidden.unique_id, ], is_api_node=True, + is_deprecated=True, ) @classmethod From 2d4a08b717c492fa45e98bd70beb48d4e77cb464 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 15 Nov 2025 22:37:34 +0200 Subject: [PATCH 571/759] Revert "chore(api-nodes): mark OpenAIDalle2 and OpenAIDalle3 nodes as deprecated (#10757)" (#10759) This reverts commit 9a0238256873711bd38ce0e0b1d15a617a1ee454. --- comfy_api_nodes/nodes_openai.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index e08bec08c..acf35d276 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -169,7 +169,6 @@ class OpenAIDalle2(IO.ComfyNode): IO.Hidden.unique_id, ], is_api_node=True, - is_deprecated=True, ) @classmethod @@ -300,7 +299,6 @@ class OpenAIDalle3(IO.ComfyNode): IO.Hidden.unique_id, ], is_api_node=True, - is_deprecated=True, ) @classmethod From 7d6103325e1c97aa54f963253e3e7f1d6da6947f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 16 Nov 2025 00:01:14 -0800 Subject: [PATCH 572/759] Change ROCm nightly install command to 7.1 (#10764) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cd8273b0d..c0384099d 100644 --- a/README.md +++ b/README.md @@ -221,7 +221,7 @@ AMD users can install rocm and pytorch with pip if you don't have it already ins This is the command to install the nightly with ROCm 7.0 which might have some performance improvements: -```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.0``` +```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.1``` ### AMD GPUs (Experimental: Windows and Linux), RDNA 3, 3.5 and 4 only. From 3d0003c24c1aec9f0c021dbc70ffb7cd8cf0685c Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 17 Nov 2025 17:17:24 -0500 Subject: [PATCH 573/759] ComfyUI version 0.3.69 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 25d1a4157..1e554eb9f 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.68" +__version__ = "0.3.69" diff --git a/pyproject.toml b/pyproject.toml index 79ff3f74a..63778286f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.68" +version = "0.3.69" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 27cbac865ec226cfd9c1563327b0d62cf5dbd484 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 17 Nov 2025 16:04:04 -0800 Subject: [PATCH 574/759] Add release workflow for NVIDIA cu126 (#10777) --- .github/workflows/release-stable-all.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/release-stable-all.yml b/.github/workflows/release-stable-all.yml index 7dca7277b..f7de3a7c3 100644 --- a/.github/workflows/release-stable-all.yml +++ b/.github/workflows/release-stable-all.yml @@ -43,6 +43,23 @@ jobs: test_release: true secrets: inherit + release_nvidia_cu126: + permissions: + contents: "write" + packages: "write" + pull-requests: "read" + name: "Release NVIDIA cu126" + uses: ./.github/workflows/stable-release.yml + with: + git_tag: ${{ inputs.git_tag }} + cache_tag: "cu126" + python_minor: "12" + python_patch: "10" + rel_name: "nvidia" + rel_extra_name: "_cu126" + test_release: true + secrets: inherit + release_amd_rocm: permissions: contents: "write" From f41e5f398d5d4059a3c87cf157bd932afcce3c0d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 17 Nov 2025 16:59:19 -0800 Subject: [PATCH 575/759] Update README with new portable download link (#10778) --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c0384099d..323dfc587 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,9 @@ Update your Nvidia drivers if it doesn't start. [Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z) -[Portable with pytorch cuda 12.8 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z) (Supports Nvidia 10 series and older GPUs). +[Portable with pytorch cuda 12.8 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z). + +[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z) (Supports Nvidia 10 series and older GPUs). #### How do I share models between another UI and ComfyUI? From fdf49a28617f742d746ad209e57ed7420b3535dc Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 18 Nov 2025 11:04:06 +0800 Subject: [PATCH 576/759] Fix the portable download link for CUDA 12.6 (#10780) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 323dfc587..28beec427 100644 --- a/README.md +++ b/README.md @@ -185,7 +185,7 @@ Update your Nvidia drivers if it doesn't start. [Portable with pytorch cuda 12.8 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z). -[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z) (Supports Nvidia 10 series and older GPUs). +[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs). #### How do I share models between another UI and ComfyUI? From 47bfd5a33fa984a1102fc2bd7b25c91a69ace288 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 17 Nov 2025 21:26:44 -0800 Subject: [PATCH 577/759] Native block swap custom nodes considered harmful. (#10783) --- comfy_extras/nodes_nop.py | 39 +++++++++++++++++++++++++++++++++++++++ nodes.py | 1 + 2 files changed, 40 insertions(+) create mode 100644 comfy_extras/nodes_nop.py diff --git a/comfy_extras/nodes_nop.py b/comfy_extras/nodes_nop.py new file mode 100644 index 000000000..953061bcb --- /dev/null +++ b/comfy_extras/nodes_nop.py @@ -0,0 +1,39 @@ +from comfy_api.latest import ComfyExtension, io +from typing_extensions import override +# If you write a node that is so useless that it breaks ComfyUI it will be featured in this exclusive list + +# "native" block swap nodes are placebo at best and break the ComfyUI memory management system. +# They are also considered harmful because instead of users reporting issues with the built in +# memory management they install these stupid nodes and complain even harder. Now it completely +# breaks with some of the new ComfyUI memory optimizations so I have made the decision to NOP it +# out of all workflows. +class wanBlockSwap(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="wanBlockSwap", + category="", + description="NOP", + inputs=[ + io.Model.Input("model"), + ], + outputs=[ + io.Model.Output(), + ], + is_deprecated=True, + ) + + @classmethod + def execute(cls, model) -> io.NodeOutput: + return io.NodeOutput(model) + + +class NopExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + wanBlockSwap + ] + +async def comfy_entrypoint() -> NopExtension: + return NopExtension() diff --git a/nodes.py b/nodes.py index 5689f6fe1..f6aeedc78 100644 --- a/nodes.py +++ b/nodes.py @@ -2330,6 +2330,7 @@ async def init_builtin_extra_nodes(): "nodes_easycache.py", "nodes_audio_encoder.py", "nodes_rope.py", + "nodes_nop.py", ] import_failed = [] From 048f49adbd19ac2d9c7c87682c832b7827a4b29d Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 18 Nov 2025 13:59:27 +0200 Subject: [PATCH 578/759] chore(api-nodes): adjusted PR template; set min python version for pylint to 3.10 (#10787) --- .github/PULL_REQUEST_TEMPLATE/api-node.md | 2 +- .github/workflows/api-node-template.yml | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE/api-node.md b/.github/PULL_REQUEST_TEMPLATE/api-node.md index f62744878..c1f1bafb1 100644 --- a/.github/PULL_REQUEST_TEMPLATE/api-node.md +++ b/.github/PULL_REQUEST_TEMPLATE/api-node.md @@ -18,4 +18,4 @@ If **Need pricing update**: - [ ] **QA not required** ### Comms -- [ ] Informed **@Kosinkadink** +- [ ] Informed **Kosinkadink** diff --git a/.github/workflows/api-node-template.yml b/.github/workflows/api-node-template.yml index 0775f9979..fdb81c0c5 100644 --- a/.github/workflows/api-node-template.yml +++ b/.github/workflows/api-node-template.yml @@ -2,7 +2,7 @@ name: Append API Node PR template on: pull_request_target: - types: [opened, reopened, synchronize, edited, ready_for_review] + types: [opened, reopened, synchronize, ready_for_review] paths: - 'comfy_api_nodes/**' # only run if these files changed diff --git a/pyproject.toml b/pyproject.toml index 63778286f..a14b383b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ lint.select = [ exclude = ["*.ipynb", "**/generated/*.pyi"] [tool.pylint] -master.py-version = "3.9" +master.py-version = "3.10" master.extension-pkg-allow-list = [ "pydantic", ] From e1ab6bb394b82fa654d5bc84043f97479d12f84c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Tue, 18 Nov 2025 17:00:21 +0200 Subject: [PATCH 579/759] EasyCache: Fix for mismatch in input/output channels with some models (#10788) Slices model input with output channels so the caching tracks only the noise channels, resolves channel mismatch with models like WanVideo I2V Also fix for slicing deprecation in pytorch 2.9 --- comfy_extras/nodes_easycache.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py index 1359e2f99..11b23ffdb 100644 --- a/comfy_extras/nodes_easycache.py +++ b/comfy_extras/nodes_easycache.py @@ -11,13 +11,13 @@ if TYPE_CHECKING: def easycache_forward_wrapper(executor, *args, **kwargs): # get values from args - x: torch.Tensor = args[0] transformer_options: dict[str] = args[-1] if not isinstance(transformer_options, dict): transformer_options = kwargs.get("transformer_options") if not transformer_options: transformer_options = args[-2] easycache: EasyCacheHolder = transformer_options["easycache"] + x: torch.Tensor = args[0][:, :easycache.output_channels] sigmas = transformer_options["sigmas"] uuids = transformer_options["uuids"] if sigmas is not None and easycache.is_past_end_timestep(sigmas): @@ -82,13 +82,13 @@ def easycache_forward_wrapper(executor, *args, **kwargs): def lazycache_predict_noise_wrapper(executor, *args, **kwargs): # get values from args - x: torch.Tensor = args[0] timestep: float = args[1] model_options: dict[str] = args[2] easycache: LazyCacheHolder = model_options["transformer_options"]["easycache"] if easycache.is_past_end_timestep(timestep): return executor(*args, **kwargs) # prepare next x_prev + x: torch.Tensor = args[0][:, :easycache.output_channels] next_x_prev = x input_change = None do_easycache = easycache.should_do_easycache(timestep) @@ -173,7 +173,7 @@ def easycache_sample_wrapper(executor, *args, **kwargs): class EasyCacheHolder: - def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False): + def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False, output_channels: int=None): self.name = "EasyCache" self.reuse_threshold = reuse_threshold self.start_percent = start_percent @@ -202,6 +202,7 @@ class EasyCacheHolder: self.allow_mismatch = True self.cut_from_start = True self.state_metadata = None + self.output_channels = output_channels def is_past_end_timestep(self, timestep: float) -> bool: return not (timestep[0] > self.end_t).item() @@ -264,7 +265,7 @@ class EasyCacheHolder: else: slicing.append(slice(None)) batch_slice = batch_slice + slicing - x[batch_slice] += self.uuid_cache_diffs[uuid].to(x.device) + x[tuple(batch_slice)] += self.uuid_cache_diffs[uuid].to(x.device) return x def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]): @@ -283,7 +284,7 @@ class EasyCacheHolder: else: slicing.append(slice(None)) skip_dim = False - x = x[slicing] + x = x[tuple(slicing)] diff = output - x batch_offset = diff.shape[0] // len(uuids) for i, uuid in enumerate(uuids): @@ -323,7 +324,7 @@ class EasyCacheHolder: return self def clone(self): - return EasyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose) + return EasyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose, output_channels=self.output_channels) class EasyCacheNode(io.ComfyNode): @@ -350,7 +351,7 @@ class EasyCacheNode(io.ComfyNode): @classmethod def execute(cls, model: io.Model.Type, reuse_threshold: float, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput: model = model.clone() - model.model_options["transformer_options"]["easycache"] = EasyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose) + model.model_options["transformer_options"]["easycache"] = EasyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose, output_channels=model.model.latent_format.latent_channels) model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "easycache", easycache_sample_wrapper) model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.CALC_COND_BATCH, "easycache", easycache_calc_cond_batch_wrapper) model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, "easycache", easycache_forward_wrapper) @@ -358,7 +359,7 @@ class EasyCacheNode(io.ComfyNode): class LazyCacheHolder: - def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False): + def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False, output_channels: int=None): self.name = "LazyCache" self.reuse_threshold = reuse_threshold self.start_percent = start_percent @@ -382,6 +383,7 @@ class LazyCacheHolder: self.approx_output_change_rates = [] self.total_steps_skipped = 0 self.state_metadata = None + self.output_channels = output_channels def has_cache_diff(self) -> bool: return self.cache_diff is not None @@ -456,7 +458,7 @@ class LazyCacheHolder: return self def clone(self): - return LazyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose) + return LazyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose, output_channels=self.output_channels) class LazyCacheNode(io.ComfyNode): @classmethod @@ -482,7 +484,7 @@ class LazyCacheNode(io.ComfyNode): @classmethod def execute(cls, model: io.Model.Type, reuse_threshold: float, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput: model = model.clone() - model.model_options["transformer_options"]["easycache"] = LazyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose) + model.model_options["transformer_options"]["easycache"] = LazyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose, output_channels=model.model.latent_format.latent_channels) model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "lazycache", easycache_sample_wrapper) model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, "lazycache", lazycache_predict_noise_wrapper) return io.NodeOutput(model) From d52697457608a045cafc3b6d6cb89f0a49ba0709 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 18 Nov 2025 13:46:19 -0800 Subject: [PATCH 580/759] Fix hunyuan 3d 2.0 (#10792) --- comfy/ldm/flux/math.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comfy/ldm/flux/math.py b/comfy/ldm/flux/math.py index 158420290..6a22df8bc 100644 --- a/comfy/ldm/flux/math.py +++ b/comfy/ldm/flux/math.py @@ -7,7 +7,8 @@ import comfy.model_management def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor: - q, k = apply_rope(q, k, pe) + if pe is not None: + q, k = apply_rope(q, k, pe) heads = q.shape[1] x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask, transformer_options=transformer_options) return x From 24fdb92edf2e96fe757c480aa7f12be5bdfa3a15 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 19 Nov 2025 00:26:44 +0200 Subject: [PATCH 581/759] feat(api-nodes): add new Gemini model (#10789) --- comfy_api_nodes/apis/gemini_api.py | 231 +++++++++++++++++++++++++++-- comfy_api_nodes/nodes_gemini.py | 47 +++--- 2 files changed, 246 insertions(+), 32 deletions(-) diff --git a/comfy_api_nodes/apis/gemini_api.py b/comfy_api_nodes/apis/gemini_api.py index 2bf28bf93..f63e02693 100644 --- a/comfy_api_nodes/apis/gemini_api.py +++ b/comfy_api_nodes/apis/gemini_api.py @@ -1,22 +1,229 @@ -from typing import Optional +from datetime import date +from enum import Enum +from typing import Any -from comfy_api_nodes.apis import GeminiGenerationConfig, GeminiContent, GeminiSafetySetting, GeminiSystemInstructionContent, GeminiTool, GeminiVideoMetadata -from pydantic import BaseModel +from pydantic import BaseModel, Field + + +class GeminiSafetyCategory(str, Enum): + HARM_CATEGORY_SEXUALLY_EXPLICIT = "HARM_CATEGORY_SEXUALLY_EXPLICIT" + HARM_CATEGORY_HATE_SPEECH = "HARM_CATEGORY_HATE_SPEECH" + HARM_CATEGORY_HARASSMENT = "HARM_CATEGORY_HARASSMENT" + HARM_CATEGORY_DANGEROUS_CONTENT = "HARM_CATEGORY_DANGEROUS_CONTENT" + + +class GeminiSafetyThreshold(str, Enum): + OFF = "OFF" + BLOCK_NONE = "BLOCK_NONE" + BLOCK_LOW_AND_ABOVE = "BLOCK_LOW_AND_ABOVE" + BLOCK_MEDIUM_AND_ABOVE = "BLOCK_MEDIUM_AND_ABOVE" + BLOCK_ONLY_HIGH = "BLOCK_ONLY_HIGH" + + +class GeminiSafetySetting(BaseModel): + category: GeminiSafetyCategory + threshold: GeminiSafetyThreshold + + +class GeminiRole(str, Enum): + user = "user" + model = "model" + + +class GeminiMimeType(str, Enum): + application_pdf = "application/pdf" + audio_mpeg = "audio/mpeg" + audio_mp3 = "audio/mp3" + audio_wav = "audio/wav" + image_png = "image/png" + image_jpeg = "image/jpeg" + image_webp = "image/webp" + text_plain = "text/plain" + video_mov = "video/mov" + video_mpeg = "video/mpeg" + video_mp4 = "video/mp4" + video_mpg = "video/mpg" + video_avi = "video/avi" + video_wmv = "video/wmv" + video_mpegps = "video/mpegps" + video_flv = "video/flv" + + +class GeminiInlineData(BaseModel): + data: str | None = Field( + None, + description="The base64 encoding of the image, PDF, or video to include inline in the prompt. " + "When including media inline, you must also specify the media type (mimeType) of the data. Size limit: 20MB", + ) + mimeType: GeminiMimeType | None = Field(None) + + +class GeminiPart(BaseModel): + inlineData: GeminiInlineData | None = Field(None) + text: str | None = Field(None) + + +class GeminiTextPart(BaseModel): + text: str | None = Field(None) + + +class GeminiContent(BaseModel): + parts: list[GeminiPart] = Field(...) + role: GeminiRole = Field(..., examples=["user"]) + + +class GeminiSystemInstructionContent(BaseModel): + parts: list[GeminiTextPart] = Field( + ..., + description="A list of ordered parts that make up a single message. " + "Different parts may have different IANA MIME types.", + ) + role: GeminiRole = Field( + ..., + description="The identity of the entity that creates the message. " + "The following values are supported: " + "user: This indicates that the message is sent by a real person, typically a user-generated message. " + "model: This indicates that the message is generated by the model. " + "The model value is used to insert messages from model into the conversation during multi-turn conversations. " + "For non-multi-turn conversations, this field can be left blank or unset.", + ) + + +class GeminiFunctionDeclaration(BaseModel): + description: str | None = Field(None) + name: str = Field(...) + parameters: dict[str, Any] = Field(..., description="JSON schema for the function parameters") + + +class GeminiTool(BaseModel): + functionDeclarations: list[GeminiFunctionDeclaration] | None = Field(None) + + +class GeminiOffset(BaseModel): + nanos: int | None = Field(None, ge=0, le=999999999) + seconds: int | None = Field(None, ge=-315576000000, le=315576000000) + + +class GeminiVideoMetadata(BaseModel): + endOffset: GeminiOffset | None = Field(None) + startOffset: GeminiOffset | None = Field(None) + + +class GeminiGenerationConfig(BaseModel): + maxOutputTokens: int | None = Field(None, ge=16, le=8192) + seed: int | None = Field(None) + stopSequences: list[str] | None = Field(None) + temperature: float | None = Field(1, ge=0.0, le=2.0) + topK: int | None = Field(40, ge=1) + topP: float | None = Field(0.95, ge=0.0, le=1.0) class GeminiImageConfig(BaseModel): - aspectRatio: Optional[str] = None + aspectRatio: str | None = Field(None) + resolution: str | None = Field(None) class GeminiImageGenerationConfig(GeminiGenerationConfig): - responseModalities: Optional[list[str]] = None - imageConfig: Optional[GeminiImageConfig] = None + responseModalities: list[str] | None = Field(None) + imageConfig: GeminiImageConfig | None = Field(None) class GeminiImageGenerateContentRequest(BaseModel): - contents: list[GeminiContent] - generationConfig: Optional[GeminiImageGenerationConfig] = None - safetySettings: Optional[list[GeminiSafetySetting]] = None - systemInstruction: Optional[GeminiSystemInstructionContent] = None - tools: Optional[list[GeminiTool]] = None - videoMetadata: Optional[GeminiVideoMetadata] = None + contents: list[GeminiContent] = Field(...) + generationConfig: GeminiImageGenerationConfig | None = Field(None) + safetySettings: list[GeminiSafetySetting] | None = Field(None) + systemInstruction: GeminiSystemInstructionContent | None = Field(None) + tools: list[GeminiTool] | None = Field(None) + videoMetadata: GeminiVideoMetadata | None = Field(None) + + +class GeminiGenerateContentRequest(BaseModel): + contents: list[GeminiContent] = Field(...) + generationConfig: GeminiGenerationConfig | None = Field(None) + safetySettings: list[GeminiSafetySetting] | None = Field(None) + systemInstruction: GeminiSystemInstructionContent | None = Field(None) + tools: list[GeminiTool] | None = Field(None) + videoMetadata: GeminiVideoMetadata | None = Field(None) + + +class Modality(str, Enum): + MODALITY_UNSPECIFIED = "MODALITY_UNSPECIFIED" + TEXT = "TEXT" + IMAGE = "IMAGE" + VIDEO = "VIDEO" + AUDIO = "AUDIO" + DOCUMENT = "DOCUMENT" + + +class ModalityTokenCount(BaseModel): + modality: Modality | None = None + tokenCount: int | None = Field(None, description="Number of tokens for the given modality.") + + +class Probability(str, Enum): + NEGLIGIBLE = "NEGLIGIBLE" + LOW = "LOW" + MEDIUM = "MEDIUM" + HIGH = "HIGH" + UNKNOWN = "UNKNOWN" + + +class GeminiSafetyRating(BaseModel): + category: GeminiSafetyCategory | None = None + probability: Probability | None = Field( + None, + description="The probability that the content violates the specified safety category", + ) + + +class GeminiCitation(BaseModel): + authors: list[str] | None = None + endIndex: int | None = None + license: str | None = None + publicationDate: date | None = None + startIndex: int | None = None + title: str | None = None + uri: str | None = None + + +class GeminiCitationMetadata(BaseModel): + citations: list[GeminiCitation] | None = None + + +class GeminiCandidate(BaseModel): + citationMetadata: GeminiCitationMetadata | None = None + content: GeminiContent | None = None + finishReason: str | None = None + safetyRatings: list[GeminiSafetyRating] | None = None + + +class GeminiPromptFeedback(BaseModel): + blockReason: str | None = None + blockReasonMessage: str | None = None + safetyRatings: list[GeminiSafetyRating] | None = None + + +class GeminiUsageMetadata(BaseModel): + cachedContentTokenCount: int | None = Field( + None, + description="Output only. Number of tokens in the cached part in the input (the cached content).", + ) + candidatesTokenCount: int | None = Field(None, description="Number of tokens in the response(s).") + candidatesTokensDetails: list[ModalityTokenCount] | None = Field( + None, description="Breakdown of candidate tokens by modality." + ) + promptTokenCount: int | None = Field( + None, + description="Number of tokens in the request. When cachedContent is set, this is still the total effective prompt size meaning this includes the number of tokens in the cached content.", + ) + promptTokensDetails: list[ModalityTokenCount] | None = Field( + None, description="Breakdown of prompt tokens by modality." + ) + thoughtsTokenCount: int | None = Field(None, description="Number of tokens present in thoughts output.") + toolUsePromptTokenCount: int | None = Field(None, description="Number of tokens present in tool-use prompt(s).") + + +class GeminiGenerateContentResponse(BaseModel): + candidates: list[GeminiCandidate] | None = Field(None) + promptFeedback: GeminiPromptFeedback | None = Field(None) + usageMetadata: GeminiUsageMetadata | None = Field(None) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 67f2469ad..6e746eebd 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -3,8 +3,6 @@ API Nodes for Gemini Multimodal LLM Usage via Remote API See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference """ -from __future__ import annotations - import base64 import json import os @@ -12,7 +10,7 @@ import time import uuid from enum import Enum from io import BytesIO -from typing import Literal, Optional +from typing import Literal import torch from typing_extensions import override @@ -20,18 +18,17 @@ from typing_extensions import override import folder_paths from comfy_api.latest import IO, ComfyExtension, Input from comfy_api.util import VideoCodec, VideoContainer -from comfy_api_nodes.apis import ( +from comfy_api_nodes.apis.gemini_api import ( GeminiContent, GeminiGenerateContentRequest, GeminiGenerateContentResponse, - GeminiInlineData, - GeminiMimeType, - GeminiPart, -) -from comfy_api_nodes.apis.gemini_api import ( GeminiImageConfig, GeminiImageGenerateContentRequest, GeminiImageGenerationConfig, + GeminiInlineData, + GeminiMimeType, + GeminiPart, + GeminiRole, ) from comfy_api_nodes.util import ( ApiEndpoint, @@ -57,6 +54,7 @@ class GeminiModel(str, Enum): gemini_2_5_flash_preview_04_17 = "gemini-2.5-flash-preview-04-17" gemini_2_5_pro = "gemini-2.5-pro" gemini_2_5_flash = "gemini-2.5-flash" + gemini_3_0_pro = "gemini-3-pro-preview" class GeminiImageModel(str, Enum): @@ -103,6 +101,16 @@ def get_parts_by_type(response: GeminiGenerateContentResponse, part_type: Litera Returns: List of response parts matching the requested type. """ + if response.candidates is None: + if response.promptFeedback.blockReason: + feedback = response.promptFeedback + raise ValueError( + f"Gemini API blocked the request. Reason: {feedback.blockReason} ({feedback.blockReasonMessage})" + ) + raise NotImplementedError( + "Gemini returned no response candidates. " + "Please report to ComfyUI repository with the example of workflow to reproduce this." + ) parts = [] for part in response.candidates[0].content.parts: if part_type == "text" and hasattr(part, "text") and part.text: @@ -272,10 +280,10 @@ class GeminiNode(IO.ComfyNode): prompt: str, model: str, seed: int, - images: Optional[torch.Tensor] = None, - audio: Optional[Input.Audio] = None, - video: Optional[Input.Video] = None, - files: Optional[list[GeminiPart]] = None, + images: torch.Tensor | None = None, + audio: Input.Audio | None = None, + video: Input.Video | None = None, + files: list[GeminiPart] | None = None, ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) @@ -300,7 +308,7 @@ class GeminiNode(IO.ComfyNode): data=GeminiGenerateContentRequest( contents=[ GeminiContent( - role="user", + role=GeminiRole.user, parts=parts, ) ] @@ -308,7 +316,6 @@ class GeminiNode(IO.ComfyNode): response_model=GeminiGenerateContentResponse, ) - # Get result output output_text = get_text_from_response(response) if output_text: # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. @@ -406,7 +413,7 @@ class GeminiInputFiles(IO.ComfyNode): ) @classmethod - def execute(cls, file: str, GEMINI_INPUT_FILES: Optional[list[GeminiPart]] = None) -> IO.NodeOutput: + def execute(cls, file: str, GEMINI_INPUT_FILES: list[GeminiPart] | None = None) -> IO.NodeOutput: """Loads and formats input files for Gemini API.""" if GEMINI_INPUT_FILES is None: GEMINI_INPUT_FILES = [] @@ -421,7 +428,7 @@ class GeminiImage(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="GeminiImageNode", - display_name="Google Gemini Image", + display_name="Nano Banana (Google Gemini Image)", category="api node/image/Gemini", description="Edit images synchronously via Google API.", inputs=[ @@ -488,8 +495,8 @@ class GeminiImage(IO.ComfyNode): prompt: str, model: str, seed: int, - images: Optional[torch.Tensor] = None, - files: Optional[list[GeminiPart]] = None, + images: torch.Tensor | None = None, + files: list[GeminiPart] | None = None, aspect_ratio: str = "auto", ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) @@ -510,7 +517,7 @@ class GeminiImage(IO.ComfyNode): endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"), data=GeminiImageGenerateContentRequest( contents=[ - GeminiContent(role="user", parts=parts), + GeminiContent(role=GeminiRole.user, parts=parts), ], generationConfig=GeminiImageGenerationConfig( responseModalities=["TEXT", "IMAGE"], From b5c8be8b1db44ded07cb1b437b9f33ebff5848c1 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 18 Nov 2025 19:37:20 -0500 Subject: [PATCH 582/759] ComfyUI 0.3.70 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 1e554eb9f..9b77aabe9 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.69" +__version__ = "0.3.70" diff --git a/pyproject.toml b/pyproject.toml index a14b383b3..289b7145b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.69" +version = "0.3.70" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 17027f2a6a20a31e2c6f3be2b1a06f39ad3a68d9 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 18 Nov 2025 19:36:03 -0800 Subject: [PATCH 583/759] Add a way to disable the final norm in the llama based TE models. (#10794) --- comfy/text_encoders/llama.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index c050759fe..feb44bbb0 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -32,6 +32,7 @@ class Llama2Config: q_norm = None k_norm = None rope_scale = None + final_norm: bool = True @dataclass class Qwen25_3BConfig: @@ -53,6 +54,7 @@ class Qwen25_3BConfig: q_norm = None k_norm = None rope_scale = None + final_norm: bool = True @dataclass class Qwen25_7BVLI_Config: @@ -74,6 +76,7 @@ class Qwen25_7BVLI_Config: q_norm = None k_norm = None rope_scale = None + final_norm: bool = True @dataclass class Gemma2_2B_Config: @@ -96,6 +99,7 @@ class Gemma2_2B_Config: k_norm = None sliding_attention = None rope_scale = None + final_norm: bool = True @dataclass class Gemma3_4B_Config: @@ -118,6 +122,7 @@ class Gemma3_4B_Config: k_norm = "gemma3" sliding_attention = [False, False, False, False, False, 1024] rope_scale = [1.0, 8.0] + final_norm: bool = True class RMSNorm(nn.Module): def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None): @@ -366,7 +371,12 @@ class Llama2_(nn.Module): transformer(config, index=i, device=device, dtype=dtype, ops=ops) for i in range(config.num_hidden_layers) ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) + + if config.final_norm: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) + else: + self.norm = None + # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype) def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[]): @@ -421,14 +431,16 @@ class Llama2_(nn.Module): if i == intermediate_output: intermediate = x.clone() - x = self.norm(x) + if self.norm is not None: + x = self.norm(x) + if all_intermediate is not None: all_intermediate.append(x.unsqueeze(1).clone()) if all_intermediate is not None: intermediate = torch.cat(all_intermediate, dim=1) - if intermediate is not None and final_layer_norm_intermediate: + if intermediate is not None and final_layer_norm_intermediate and self.norm is not None: intermediate = self.norm(intermediate) return x, intermediate From 65ee24c9789b93660ebe978a3186486f105298c2 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 19 Nov 2025 11:25:28 +0200 Subject: [PATCH 584/759] change display name of PreviewAny node to "Preview as Text" (#10796) --- comfy_extras/nodes_preview_any.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_preview_any.py b/comfy_extras/nodes_preview_any.py index e749fa6ae..139b07c93 100644 --- a/comfy_extras/nodes_preview_any.py +++ b/comfy_extras/nodes_preview_any.py @@ -39,5 +39,5 @@ NODE_CLASS_MAPPINGS = { } NODE_DISPLAY_NAME_MAPPINGS = { - "PreviewAny": "Preview Any", + "PreviewAny": "Preview as Text", } From 6a1d3a1ae131f3fff7f45a7e835eb10e9d1338ee Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 20 Nov 2025 00:49:01 +0200 Subject: [PATCH 585/759] convert hunyuan3d.py to V3 schema (#10664) --- comfy_api/latest/__init__.py | 4 +- comfy_api/latest/_io.py | 5 +- comfy_api/latest/_util/__init__.py | 3 + comfy_api/latest/_util/geometry_types.py | 12 + comfy_extras/nodes_hunyuan3d.py | 274 +++++++++++++---------- 5 files changed, 178 insertions(+), 120 deletions(-) create mode 100644 comfy_api/latest/_util/geometry_types.py diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py index b7a3fa9c1..176ae36e0 100644 --- a/comfy_api/latest/__init__.py +++ b/comfy_api/latest/__init__.py @@ -7,7 +7,7 @@ from comfy_api.internal.singleton import ProxiedSingleton from comfy_api.internal.async_to_sync import create_sync_class from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents -from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents +from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL from . import _io as io from . import _ui as ui # from comfy_api.latest._resources import _RESOURCES as resources #noqa: F401 @@ -104,6 +104,8 @@ class Types: VideoCodec = VideoCodec VideoContainer = VideoContainer VideoComponents = VideoComponents + MESH = MESH + VOXEL = VOXEL ComfyAPI = ComfyAPI_latest diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 0b701260f..863254ce7 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -27,6 +27,7 @@ from comfy_api.internal import (_ComfyNodeInternal, _NodeOutputInternal, classpr prune_dict, shallow_clone_class) from comfy_api.latest._resources import Resources, ResourcesLocal from comfy_execution.graph_utils import ExecutionBlocker +from ._util import MESH, VOXEL # from comfy_extras.nodes_images import SVG as SVG_ # NOTE: needs to be moved before can be imported due to circular reference @@ -656,11 +657,11 @@ class LossMap(ComfyTypeIO): @comfytype(io_type="VOXEL") class Voxel(ComfyTypeIO): - Type = Any # TODO: VOXEL class is defined in comfy_extras/nodes_hunyuan3d.py; should be moved to somewhere else before referenced directly in v3 + Type = VOXEL @comfytype(io_type="MESH") class Mesh(ComfyTypeIO): - Type = Any # TODO: MESH class is defined in comfy_extras/nodes_hunyuan3d.py; should be moved to somewhere else before referenced directly in v3 + Type = MESH @comfytype(io_type="HOOKS") class Hooks(ComfyTypeIO): diff --git a/comfy_api/latest/_util/__init__.py b/comfy_api/latest/_util/__init__.py index 9019c46db..fc5431dda 100644 --- a/comfy_api/latest/_util/__init__.py +++ b/comfy_api/latest/_util/__init__.py @@ -1,8 +1,11 @@ from .video_types import VideoContainer, VideoCodec, VideoComponents +from .geometry_types import VOXEL, MESH __all__ = [ # Utility Types "VideoContainer", "VideoCodec", "VideoComponents", + "VOXEL", + "MESH", ] diff --git a/comfy_api/latest/_util/geometry_types.py b/comfy_api/latest/_util/geometry_types.py new file mode 100644 index 000000000..385122778 --- /dev/null +++ b/comfy_api/latest/_util/geometry_types.py @@ -0,0 +1,12 @@ +import torch + + +class VOXEL: + def __init__(self, data: torch.Tensor): + self.data = data + + +class MESH: + def __init__(self, vertices: torch.Tensor, faces: torch.Tensor): + self.vertices = vertices + self.faces = faces diff --git a/comfy_extras/nodes_hunyuan3d.py b/comfy_extras/nodes_hunyuan3d.py index f6e71e0a8..adca14f62 100644 --- a/comfy_extras/nodes_hunyuan3d.py +++ b/comfy_extras/nodes_hunyuan3d.py @@ -7,63 +7,79 @@ from comfy.ldm.modules.diffusionmodules.mmdit import get_1d_sincos_pos_embed_fro import folder_paths import comfy.model_management from comfy.cli_args import args +from typing_extensions import override +from comfy_api.latest import ComfyExtension, IO, Types +from comfy_api.latest._util import MESH, VOXEL # only for backward compatibility if someone import it from this file (will be removed later) # noqa -class EmptyLatentHunyuan3Dv2: + +class EmptyLatentHunyuan3Dv2(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "resolution": ("INT", {"default": 3072, "min": 1, "max": 8192}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent images in the batch."}), - } - } + def define_schema(cls): + return IO.Schema( + node_id="EmptyLatentHunyuan3Dv2", + category="latent/3d", + inputs=[ + IO.Int.Input("resolution", default=3072, min=1, max=8192), + IO.Int.Input("batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."), + ], + outputs=[ + IO.Latent.Output(), + ] + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" - - CATEGORY = "latent/3d" - - def generate(self, resolution, batch_size): + @classmethod + def execute(cls, resolution, batch_size) -> IO.NodeOutput: latent = torch.zeros([batch_size, 64, resolution], device=comfy.model_management.intermediate_device()) - return ({"samples": latent, "type": "hunyuan3dv2"}, ) + return IO.NodeOutput({"samples": latent, "type": "hunyuan3dv2"}) -class Hunyuan3Dv2Conditioning: + generate = execute # TODO: remove + + +class Hunyuan3Dv2Conditioning(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"clip_vision_output": ("CLIP_VISION_OUTPUT",), - }} + def define_schema(cls): + return IO.Schema( + node_id="Hunyuan3Dv2Conditioning", + category="conditioning/video_models", + inputs=[ + IO.ClipVisionOutput.Input("clip_vision_output"), + ], + outputs=[ + IO.Conditioning.Output(display_name="positive"), + IO.Conditioning.Output(display_name="negative"), + ] + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING") - RETURN_NAMES = ("positive", "negative") - - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, clip_vision_output): + @classmethod + def execute(cls, clip_vision_output) -> IO.NodeOutput: embeds = clip_vision_output.last_hidden_state positive = [[embeds, {}]] negative = [[torch.zeros_like(embeds), {}]] - return (positive, negative) + return IO.NodeOutput(positive, negative) + + encode = execute # TODO: remove -class Hunyuan3Dv2ConditioningMultiView: +class Hunyuan3Dv2ConditioningMultiView(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {}, - "optional": {"front": ("CLIP_VISION_OUTPUT",), - "left": ("CLIP_VISION_OUTPUT",), - "back": ("CLIP_VISION_OUTPUT",), - "right": ("CLIP_VISION_OUTPUT",), }} + def define_schema(cls): + return IO.Schema( + node_id="Hunyuan3Dv2ConditioningMultiView", + category="conditioning/video_models", + inputs=[ + IO.ClipVisionOutput.Input("front", optional=True), + IO.ClipVisionOutput.Input("left", optional=True), + IO.ClipVisionOutput.Input("back", optional=True), + IO.ClipVisionOutput.Input("right", optional=True), + ], + outputs=[ + IO.Conditioning.Output(display_name="positive"), + IO.Conditioning.Output(display_name="negative"), + ] + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING") - RETURN_NAMES = ("positive", "negative") - - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, front=None, left=None, back=None, right=None): + @classmethod + def execute(cls, front=None, left=None, back=None, right=None) -> IO.NodeOutput: all_embeds = [front, left, back, right] out = [] pos_embeds = None @@ -76,29 +92,35 @@ class Hunyuan3Dv2ConditioningMultiView: embeds = torch.cat(out, dim=1) positive = [[embeds, {}]] negative = [[torch.zeros_like(embeds), {}]] - return (positive, negative) + return IO.NodeOutput(positive, negative) + + encode = execute # TODO: remove -class VOXEL: - def __init__(self, data): - self.data = data - -class VAEDecodeHunyuan3D: +class VAEDecodeHunyuan3D(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"samples": ("LATENT", ), - "vae": ("VAE", ), - "num_chunks": ("INT", {"default": 8000, "min": 1000, "max": 500000}), - "octree_resolution": ("INT", {"default": 256, "min": 16, "max": 512}), - }} - RETURN_TYPES = ("VOXEL",) - FUNCTION = "decode" + def define_schema(cls): + return IO.Schema( + node_id="VAEDecodeHunyuan3D", + category="latent/3d", + inputs=[ + IO.Latent.Input("samples"), + IO.Vae.Input("vae"), + IO.Int.Input("num_chunks", default=8000, min=1000, max=500000), + IO.Int.Input("octree_resolution", default=256, min=16, max=512), + ], + outputs=[ + IO.Voxel.Output(), + ] + ) - CATEGORY = "latent/3d" + @classmethod + def execute(cls, vae, samples, num_chunks, octree_resolution) -> IO.NodeOutput: + voxels = Types.VOXEL(vae.decode(samples["samples"], vae_options={"num_chunks": num_chunks, "octree_resolution": octree_resolution})) + return IO.NodeOutput(voxels) + + decode = execute # TODO: remove - def decode(self, vae, samples, num_chunks, octree_resolution): - voxels = VOXEL(vae.decode(samples["samples"], vae_options={"num_chunks": num_chunks, "octree_resolution": octree_resolution})) - return (voxels, ) def voxel_to_mesh(voxels, threshold=0.5, device=None): if device is None: @@ -396,24 +418,24 @@ def voxel_to_mesh_surfnet(voxels, threshold=0.5, device=None): return final_vertices, faces -class MESH: - def __init__(self, vertices, faces): - self.vertices = vertices - self.faces = faces - -class VoxelToMeshBasic: +class VoxelToMeshBasic(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"voxel": ("VOXEL", ), - "threshold": ("FLOAT", {"default": 0.6, "min": -1.0, "max": 1.0, "step": 0.01}), - }} - RETURN_TYPES = ("MESH",) - FUNCTION = "decode" + def define_schema(cls): + return IO.Schema( + node_id="VoxelToMeshBasic", + category="3d", + inputs=[ + IO.Voxel.Input("voxel"), + IO.Float.Input("threshold", default=0.6, min=-1.0, max=1.0, step=0.01), + ], + outputs=[ + IO.Mesh.Output(), + ] + ) - CATEGORY = "3d" - - def decode(self, voxel, threshold): + @classmethod + def execute(cls, voxel, threshold) -> IO.NodeOutput: vertices = [] faces = [] for x in voxel.data: @@ -421,21 +443,29 @@ class VoxelToMeshBasic: vertices.append(v) faces.append(f) - return (MESH(torch.stack(vertices), torch.stack(faces)), ) + return IO.NodeOutput(Types.MESH(torch.stack(vertices), torch.stack(faces))) -class VoxelToMesh: + decode = execute # TODO: remove + + +class VoxelToMesh(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"voxel": ("VOXEL", ), - "algorithm": (["surface net", "basic"], ), - "threshold": ("FLOAT", {"default": 0.6, "min": -1.0, "max": 1.0, "step": 0.01}), - }} - RETURN_TYPES = ("MESH",) - FUNCTION = "decode" + def define_schema(cls): + return IO.Schema( + node_id="VoxelToMesh", + category="3d", + inputs=[ + IO.Voxel.Input("voxel"), + IO.Combo.Input("algorithm", options=["surface net", "basic"]), + IO.Float.Input("threshold", default=0.6, min=-1.0, max=1.0, step=0.01), + ], + outputs=[ + IO.Mesh.Output(), + ] + ) - CATEGORY = "3d" - - def decode(self, voxel, algorithm, threshold): + @classmethod + def execute(cls, voxel, algorithm, threshold) -> IO.NodeOutput: vertices = [] faces = [] @@ -449,7 +479,9 @@ class VoxelToMesh: vertices.append(v) faces.append(f) - return (MESH(torch.stack(vertices), torch.stack(faces)), ) + return IO.NodeOutput(Types.MESH(torch.stack(vertices), torch.stack(faces))) + + decode = execute # TODO: remove def save_glb(vertices, faces, filepath, metadata=None): @@ -581,31 +613,32 @@ def save_glb(vertices, faces, filepath, metadata=None): return filepath -class SaveGLB: +class SaveGLB(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"mesh": ("MESH", ), - "filename_prefix": ("STRING", {"default": "mesh/ComfyUI"}), }, - "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, } + def define_schema(cls): + return IO.Schema( + node_id="SaveGLB", + category="3d", + is_output_node=True, + inputs=[ + IO.Mesh.Input("mesh"), + IO.String.Input("filename_prefix", default="mesh/ComfyUI"), + ], + hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo] + ) - RETURN_TYPES = () - FUNCTION = "save" - - OUTPUT_NODE = True - - CATEGORY = "3d" - - def save(self, mesh, filename_prefix, prompt=None, extra_pnginfo=None): + @classmethod + def execute(cls, mesh, filename_prefix) -> IO.NodeOutput: full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, folder_paths.get_output_directory()) results = [] metadata = {} if not args.disable_metadata: - if prompt is not None: - metadata["prompt"] = json.dumps(prompt) - if extra_pnginfo is not None: - for x in extra_pnginfo: - metadata[x] = json.dumps(extra_pnginfo[x]) + if cls.hidden.prompt is not None: + metadata["prompt"] = json.dumps(cls.hidden.prompt) + if cls.hidden.extra_pnginfo is not None: + for x in cls.hidden.extra_pnginfo: + metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x]) for i in range(mesh.vertices.shape[0]): f = f"{filename}_{counter:05}_.glb" @@ -616,15 +649,22 @@ class SaveGLB: "type": "output" }) counter += 1 - return {"ui": {"3d": results}} + return IO.NodeOutput(ui={"3d": results}) -NODE_CLASS_MAPPINGS = { - "EmptyLatentHunyuan3Dv2": EmptyLatentHunyuan3Dv2, - "Hunyuan3Dv2Conditioning": Hunyuan3Dv2Conditioning, - "Hunyuan3Dv2ConditioningMultiView": Hunyuan3Dv2ConditioningMultiView, - "VAEDecodeHunyuan3D": VAEDecodeHunyuan3D, - "VoxelToMeshBasic": VoxelToMeshBasic, - "VoxelToMesh": VoxelToMesh, - "SaveGLB": SaveGLB, -} +class Hunyuan3dExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + EmptyLatentHunyuan3Dv2, + Hunyuan3Dv2Conditioning, + Hunyuan3Dv2ConditioningMultiView, + VAEDecodeHunyuan3D, + VoxelToMeshBasic, + VoxelToMesh, + SaveGLB, + ] + + +async def comfy_entrypoint() -> Hunyuan3dExtension: + return Hunyuan3dExtension() From 7601e89255cde24667d3b4e6022f1385d901748b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:17:15 -0800 Subject: [PATCH 586/759] Fix workflow name. (#10806) --- .github/workflows/release-stable-all.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-stable-all.yml b/.github/workflows/release-stable-all.yml index f7de3a7c3..9274b4170 100644 --- a/.github/workflows/release-stable-all.yml +++ b/.github/workflows/release-stable-all.yml @@ -14,7 +14,7 @@ jobs: contents: "write" packages: "write" pull-requests: "read" - name: "Release NVIDIA Default (cu129)" + name: "Release NVIDIA Default (cu130)" uses: ./.github/workflows/stable-release.yml with: git_tag: ${{ inputs.git_tag }} From 394348f5caaa062eac11a57e2997aacccd4246eb Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 20 Nov 2025 03:44:04 +0200 Subject: [PATCH 587/759] feat(api-nodes): add Topaz API nodes (#10755) --- comfy_api_nodes/apis/topaz_api.py | 133 ++++++++++ comfy_api_nodes/nodes_topaz.py | 421 ++++++++++++++++++++++++++++++ comfy_api_nodes/util/client.py | 9 +- nodes.py | 1 + 4 files changed, 560 insertions(+), 4 deletions(-) create mode 100644 comfy_api_nodes/apis/topaz_api.py create mode 100644 comfy_api_nodes/nodes_topaz.py diff --git a/comfy_api_nodes/apis/topaz_api.py b/comfy_api_nodes/apis/topaz_api.py new file mode 100644 index 000000000..4d9e62e72 --- /dev/null +++ b/comfy_api_nodes/apis/topaz_api.py @@ -0,0 +1,133 @@ +from typing import Optional, Union + +from pydantic import BaseModel, Field + + +class ImageEnhanceRequest(BaseModel): + model: str = Field("Reimagine") + output_format: str = Field("jpeg") + subject_detection: str = Field("All") + face_enhancement: bool = Field(True) + face_enhancement_creativity: float = Field(0, description="Is ignored if face_enhancement is false") + face_enhancement_strength: float = Field(0.8, description="Is ignored if face_enhancement is false") + source_url: str = Field(...) + output_width: Optional[int] = Field(None) + output_height: Optional[int] = Field(None) + crop_to_fill: bool = Field(False) + prompt: Optional[str] = Field(None, description="Text prompt for creative upscaling guidance") + creativity: int = Field(3, description="Creativity settings range from 1 to 9") + face_preservation: str = Field("true", description="To preserve the identity of characters") + color_preservation: str = Field("true", description="To preserve the original color") + + +class ImageAsyncTaskResponse(BaseModel): + process_id: str = Field(...) + + +class ImageStatusResponse(BaseModel): + process_id: str = Field(...) + status: str = Field(...) + progress: Optional[int] = Field(None) + credits: int = Field(...) + + +class ImageDownloadResponse(BaseModel): + download_url: str = Field(...) + expiry: int = Field(...) + + +class Resolution(BaseModel): + width: int = Field(...) + height: int = Field(...) + + +class CreateCreateVideoRequestSource(BaseModel): + container: str = Field(...) + size: int = Field(..., description="Size of the video file in bytes") + duration: int = Field(..., description="Duration of the video file in seconds") + frameCount: int = Field(..., description="Total number of frames in the video") + frameRate: int = Field(...) + resolution: Resolution = Field(...) + + +class VideoFrameInterpolationFilter(BaseModel): + model: str = Field(...) + slowmo: Optional[int] = Field(None) + fps: int = Field(...) + duplicate: bool = Field(...) + duplicate_threshold: float = Field(...) + + +class VideoEnhancementFilter(BaseModel): + model: str = Field(...) + auto: Optional[str] = Field(None, description="Auto, Manual, Relative") + focusFixLevel: Optional[str] = Field(None, description="Downscales video input for correction of blurred subjects") + compression: Optional[float] = Field(None, description="Strength of compression recovery") + details: Optional[float] = Field(None, description="Amount of detail reconstruction") + prenoise: Optional[float] = Field(None, description="Amount of noise to add to input to reduce over-smoothing") + noise: Optional[float] = Field(None, description="Amount of noise reduction") + halo: Optional[float] = Field(None, description="Amount of halo reduction") + preblur: Optional[float] = Field(None, description="Anti-aliasing and deblurring strength") + blur: Optional[float] = Field(None, description="Amount of sharpness applied") + grain: Optional[float] = Field(None, description="Grain after AI model processing") + grainSize: Optional[float] = Field(None, description="Size of generated grain") + recoverOriginalDetailValue: Optional[float] = Field(None, description="Source details into the output video") + creativity: Optional[str] = Field(None, description="Creativity level(high, low) for slc-1 only") + isOptimizedMode: Optional[bool] = Field(None, description="Set to true for Starlight Creative (slc-1) only") + + +class OutputInformationVideo(BaseModel): + resolution: Resolution = Field(...) + frameRate: int = Field(...) + audioCodec: Optional[str] = Field(..., description="Required if audioTransfer is Copy or Convert") + audioTransfer: str = Field(..., description="Copy, Convert, None") + dynamicCompressionLevel: str = Field(..., description="Low, Mid, High") + + +class Overrides(BaseModel): + isPaidDiffusion: bool = Field(True) + + +class CreateVideoRequest(BaseModel): + source: CreateCreateVideoRequestSource = Field(...) + filters: list[Union[VideoFrameInterpolationFilter, VideoEnhancementFilter]] = Field(...) + output: OutputInformationVideo = Field(...) + overrides: Overrides = Field(Overrides(isPaidDiffusion=True)) + + +class CreateVideoResponse(BaseModel): + requestId: str = Field(...) + + +class VideoAcceptResponse(BaseModel): + uploadId: str = Field(...) + urls: list[str] = Field(...) + + +class VideoCompleteUploadRequestPart(BaseModel): + partNum: int = Field(...) + eTag: str = Field(...) + + +class VideoCompleteUploadRequest(BaseModel): + uploadResults: list[VideoCompleteUploadRequestPart] = Field(...) + + +class VideoCompleteUploadResponse(BaseModel): + message: str = Field(..., description="Confirmation message") + + +class VideoStatusResponseEstimates(BaseModel): + cost: list[int] = Field(...) + + +class VideoStatusResponseDownloadUrl(BaseModel): + url: str = Field(...) + + +class VideoStatusResponse(BaseModel): + status: str = Field(...) + estimates: Optional[VideoStatusResponseEstimates] = Field(None) + progress: Optional[float] = Field(None) + message: Optional[str] = Field("") + download: Optional[VideoStatusResponseDownloadUrl] = Field(None) diff --git a/comfy_api_nodes/nodes_topaz.py b/comfy_api_nodes/nodes_topaz.py new file mode 100644 index 000000000..79c7bf43d --- /dev/null +++ b/comfy_api_nodes/nodes_topaz.py @@ -0,0 +1,421 @@ +import builtins +from io import BytesIO + +import aiohttp +import torch +from typing_extensions import override + +from comfy_api.input.video_types import VideoInput +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.apis import topaz_api +from comfy_api_nodes.util import ( + ApiEndpoint, + download_url_to_image_tensor, + download_url_to_video_output, + get_fs_object_size, + get_number_of_images, + poll_op, + sync_op, + upload_images_to_comfyapi, + validate_container_format_is_mp4, +) + +UPSCALER_MODELS_MAP = { + "Starlight (Astra) Fast": "slf-1", + "Starlight (Astra) Creative": "slc-1", +} +UPSCALER_VALUES_MAP = { + "FullHD (1080p)": 1920, + "4K (2160p)": 3840, +} + + +class TopazImageEnhance(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="TopazImageEnhance", + display_name="Topaz Image Enhance", + category="api node/image/Topaz", + description="Industry-standard upscaling and image enhancement.", + inputs=[ + IO.Combo.Input("model", options=["Reimagine"]), + IO.Image.Input("image"), + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Optional text prompt for creative upscaling guidance.", + optional=True, + ), + IO.Combo.Input( + "subject_detection", + options=["All", "Foreground", "Background"], + optional=True, + ), + IO.Boolean.Input( + "face_enhancement", + default=True, + optional=True, + tooltip="Enhance faces (if present) during processing.", + ), + IO.Float.Input( + "face_enhancement_creativity", + default=0.0, + min=0.0, + max=1.0, + step=0.01, + display_mode=IO.NumberDisplay.number, + optional=True, + tooltip="Set the creativity level for face enhancement.", + ), + IO.Float.Input( + "face_enhancement_strength", + default=1.0, + min=0.0, + max=1.0, + step=0.01, + display_mode=IO.NumberDisplay.number, + optional=True, + tooltip="Controls how sharp enhanced faces are relative to the background.", + ), + IO.Boolean.Input( + "crop_to_fill", + default=False, + optional=True, + tooltip="By default, the image is letterboxed when the output aspect ratio differs. " + "Enable to crop the image to fill the output dimensions.", + ), + IO.Int.Input( + "output_width", + default=0, + min=0, + max=32000, + step=1, + display_mode=IO.NumberDisplay.number, + optional=True, + tooltip="Zero value means to calculate automatically (usually it will be original size or output_height if specified).", + ), + IO.Int.Input( + "output_height", + default=0, + min=0, + max=32000, + step=1, + display_mode=IO.NumberDisplay.number, + optional=True, + tooltip="Zero value means to output in the same height as original or output width.", + ), + IO.Int.Input( + "creativity", + default=3, + min=1, + max=9, + step=1, + display_mode=IO.NumberDisplay.slider, + optional=True, + ), + IO.Boolean.Input( + "face_preservation", + default=True, + optional=True, + tooltip="Preserve subjects' facial identity.", + ), + IO.Boolean.Input( + "color_preservation", + default=True, + optional=True, + tooltip="Preserve the original colors.", + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + image: torch.Tensor, + prompt: str = "", + subject_detection: str = "All", + face_enhancement: bool = True, + face_enhancement_creativity: float = 1.0, + face_enhancement_strength: float = 0.8, + crop_to_fill: bool = False, + output_width: int = 0, + output_height: int = 0, + creativity: int = 3, + face_preservation: bool = True, + color_preservation: bool = True, + ) -> IO.NodeOutput: + if get_number_of_images(image) != 1: + raise ValueError("Only one input image is supported.") + download_url = await upload_images_to_comfyapi(cls, image, max_images=1, mime_type="image/png") + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/topaz/image/v1/enhance-gen/async", method="POST"), + response_model=topaz_api.ImageAsyncTaskResponse, + data=topaz_api.ImageEnhanceRequest( + model=model, + prompt=prompt, + subject_detection=subject_detection, + face_enhancement=face_enhancement, + face_enhancement_creativity=face_enhancement_creativity, + face_enhancement_strength=face_enhancement_strength, + crop_to_fill=crop_to_fill, + output_width=output_width if output_width else None, + output_height=output_height if output_height else None, + creativity=creativity, + face_preservation=str(face_preservation).lower(), + color_preservation=str(color_preservation).lower(), + source_url=download_url[0], + output_format="png", + ), + content_type="multipart/form-data", + ) + + await poll_op( + cls, + poll_endpoint=ApiEndpoint(path=f"/proxy/topaz/image/v1/status/{initial_response.process_id}"), + response_model=topaz_api.ImageStatusResponse, + status_extractor=lambda x: x.status, + progress_extractor=lambda x: getattr(x, "progress", 0), + price_extractor=lambda x: x.credits * 0.08, + poll_interval=8.0, + max_poll_attempts=160, + estimated_duration=60, + ) + + results = await sync_op( + cls, + ApiEndpoint(path=f"/proxy/topaz/image/v1/download/{initial_response.process_id}"), + response_model=topaz_api.ImageDownloadResponse, + monitor_progress=False, + ) + return IO.NodeOutput(await download_url_to_image_tensor(results.download_url)) + + +class TopazVideoEnhance(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="TopazVideoEnhance", + display_name="Topaz Video Enhance", + category="api node/video/Topaz", + description="Breathe new life into video with powerful upscaling and recovery technology.", + inputs=[ + IO.Video.Input("video"), + IO.Boolean.Input("upscaler_enabled", default=True), + IO.Combo.Input("upscaler_model", options=list(UPSCALER_MODELS_MAP.keys())), + IO.Combo.Input("upscaler_resolution", options=list(UPSCALER_VALUES_MAP.keys())), + IO.Combo.Input( + "upscaler_creativity", + options=["low", "middle", "high"], + default="low", + tooltip="Creativity level (applies only to Starlight (Astra) Creative).", + optional=True, + ), + IO.Boolean.Input("interpolation_enabled", default=False, optional=True), + IO.Combo.Input("interpolation_model", options=["apo-8"], default="apo-8", optional=True), + IO.Int.Input( + "interpolation_slowmo", + default=1, + min=1, + max=16, + display_mode=IO.NumberDisplay.number, + tooltip="Slow-motion factor applied to the input video. " + "For example, 2 makes the output twice as slow and doubles the duration.", + optional=True, + ), + IO.Int.Input( + "interpolation_frame_rate", + default=60, + min=15, + max=240, + display_mode=IO.NumberDisplay.number, + tooltip="Output frame rate.", + optional=True, + ), + IO.Boolean.Input( + "interpolation_duplicate", + default=False, + tooltip="Analyze the input for duplicate frames and remove them.", + optional=True, + ), + IO.Float.Input( + "interpolation_duplicate_threshold", + default=0.01, + min=0.001, + max=0.1, + step=0.001, + display_mode=IO.NumberDisplay.number, + tooltip="Detection sensitivity for duplicate frames.", + optional=True, + ), + IO.Combo.Input( + "dynamic_compression_level", + options=["Low", "Mid", "High"], + default="Low", + tooltip="CQP level.", + optional=True, + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + video: VideoInput, + upscaler_enabled: bool, + upscaler_model: str, + upscaler_resolution: str, + upscaler_creativity: str = "low", + interpolation_enabled: bool = False, + interpolation_model: str = "apo-8", + interpolation_slowmo: int = 1, + interpolation_frame_rate: int = 60, + interpolation_duplicate: bool = False, + interpolation_duplicate_threshold: float = 0.01, + dynamic_compression_level: str = "Low", + ) -> IO.NodeOutput: + if upscaler_enabled is False and interpolation_enabled is False: + raise ValueError("There is nothing to do: both upscaling and interpolation are disabled.") + src_width, src_height = video.get_dimensions() + video_components = video.get_components() + src_frame_rate = int(video_components.frame_rate) + duration_sec = video.get_duration() + estimated_frames = int(duration_sec * src_frame_rate) + validate_container_format_is_mp4(video) + src_video_stream = video.get_stream_source() + target_width = src_width + target_height = src_height + target_frame_rate = src_frame_rate + filters = [] + if upscaler_enabled: + target_width = UPSCALER_VALUES_MAP[upscaler_resolution] + target_height = UPSCALER_VALUES_MAP[upscaler_resolution] + filters.append( + topaz_api.VideoEnhancementFilter( + model=UPSCALER_MODELS_MAP[upscaler_model], + creativity=(upscaler_creativity if UPSCALER_MODELS_MAP[upscaler_model] == "slc-1" else None), + isOptimizedMode=(True if UPSCALER_MODELS_MAP[upscaler_model] == "slc-1" else None), + ), + ) + if interpolation_enabled: + target_frame_rate = interpolation_frame_rate + filters.append( + topaz_api.VideoFrameInterpolationFilter( + model=interpolation_model, + slowmo=interpolation_slowmo, + fps=interpolation_frame_rate, + duplicate=interpolation_duplicate, + duplicate_threshold=interpolation_duplicate_threshold, + ), + ) + initial_res = await sync_op( + cls, + ApiEndpoint(path="/proxy/topaz/video/", method="POST"), + response_model=topaz_api.CreateVideoResponse, + data=topaz_api.CreateVideoRequest( + source=topaz_api.CreateCreateVideoRequestSource( + container="mp4", + size=get_fs_object_size(src_video_stream), + duration=int(duration_sec), + frameCount=estimated_frames, + frameRate=src_frame_rate, + resolution=topaz_api.Resolution(width=src_width, height=src_height), + ), + filters=filters, + output=topaz_api.OutputInformationVideo( + resolution=topaz_api.Resolution(width=target_width, height=target_height), + frameRate=target_frame_rate, + audioCodec="AAC", + audioTransfer="Copy", + dynamicCompressionLevel=dynamic_compression_level, + ), + ), + wait_label="Creating task", + final_label_on_success="Task created", + ) + upload_res = await sync_op( + cls, + ApiEndpoint( + path=f"/proxy/topaz/video/{initial_res.requestId}/accept", + method="PATCH", + ), + response_model=topaz_api.VideoAcceptResponse, + wait_label="Preparing upload", + final_label_on_success="Upload started", + ) + if len(upload_res.urls) > 1: + raise NotImplementedError( + "Large files are not currently supported. Please open an issue in the ComfyUI repository." + ) + async with aiohttp.ClientSession(headers={"Content-Type": "video/mp4"}) as session: + if isinstance(src_video_stream, BytesIO): + src_video_stream.seek(0) + async with session.put(upload_res.urls[0], data=src_video_stream, raise_for_status=True) as res: + upload_etag = res.headers["Etag"] + else: + with builtins.open(src_video_stream, "rb") as video_file: + async with session.put(upload_res.urls[0], data=video_file, raise_for_status=True) as res: + upload_etag = res.headers["Etag"] + await sync_op( + cls, + ApiEndpoint( + path=f"/proxy/topaz/video/{initial_res.requestId}/complete-upload", + method="PATCH", + ), + response_model=topaz_api.VideoCompleteUploadResponse, + data=topaz_api.VideoCompleteUploadRequest( + uploadResults=[ + topaz_api.VideoCompleteUploadRequestPart( + partNum=1, + eTag=upload_etag, + ), + ], + ), + wait_label="Finalizing upload", + final_label_on_success="Upload completed", + ) + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/topaz/video/{initial_res.requestId}/status"), + response_model=topaz_api.VideoStatusResponse, + status_extractor=lambda x: x.status, + progress_extractor=lambda x: getattr(x, "progress", 0), + price_extractor=lambda x: (x.estimates.cost[0] * 0.08 if x.estimates and x.estimates.cost[0] else None), + poll_interval=10.0, + max_poll_attempts=320, + ) + return IO.NodeOutput(await download_url_to_video_output(final_response.download.url)) + + +class TopazExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + TopazImageEnhance, + TopazVideoEnhance, + ] + + +async def comfy_entrypoint() -> TopazExtension: + return TopazExtension() diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py index 2d5dcd648..ad6e3c0d0 100644 --- a/comfy_api_nodes/util/client.py +++ b/comfy_api_nodes/util/client.py @@ -77,9 +77,9 @@ class _PollUIState: _RETRY_STATUS = {408, 429, 500, 502, 503, 504} -COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done"] -FAILED_STATUSES = ["cancelled", "canceled", "fail", "failed", "error"] -QUEUED_STATUSES = ["created", "queued", "queueing", "submitted"] +COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done", "complete"] +FAILED_STATUSES = ["cancelled", "canceled", "canceling", "fail", "failed", "error"] +QUEUED_STATUSES = ["created", "queued", "queueing", "submitted", "initializing"] async def sync_op( @@ -424,7 +424,8 @@ def _display_text( if status: display_lines.append(f"Status: {status.capitalize() if isinstance(status, str) else status}") if price is not None: - display_lines.append(f"Price: ${float(price):,.4f}") + p = f"{float(price):,.4f}".rstrip("0").rstrip(".") + display_lines.append(f"Price: ${p}") if text is not None: display_lines.append(text) if display_lines: diff --git a/nodes.py b/nodes.py index f6aeedc78..ac14e39a7 100644 --- a/nodes.py +++ b/nodes.py @@ -2359,6 +2359,7 @@ async def init_builtin_api_nodes(): "nodes_pika.py", "nodes_runway.py", "nodes_sora.py", + "nodes_topaz.py", "nodes_tripo.py", "nodes_moonvalley.py", "nodes_rodin.py", From cb96d4d18c78ee09d5fd70954ffcb4ad2c7f0d7a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 19 Nov 2025 20:56:23 -0800 Subject: [PATCH 588/759] Disable workaround on newer cudnn. (#10807) --- comfy/ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comfy/ops.py b/comfy/ops.py index 2a90a5ba2..640622fd1 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -58,7 +58,8 @@ except (ModuleNotFoundError, TypeError): NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False try: if comfy.model_management.is_nvidia(): - if torch.backends.cudnn.version() >= 91002 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10): + cudnn_version = torch.backends.cudnn.version() + if (cudnn_version >= 91002 and cudnn_version < 91500) and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10): #TODO: change upper bound version once it's fixed' NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True logging.info("working around nvidia conv3d memory bug.") From 87b0359392219841c2214e1eb06678840cae470e Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Wed, 19 Nov 2025 22:36:56 -0800 Subject: [PATCH 589/759] Update server templates handler to use new multi-package distribution (comfyui-workflow-templates versions >=0.3) (#10791) * update templates for monorepo * refactor --- app/frontend_management.py | 67 ++++++++++++++++++++++++++++++++++++-- requirements.txt | 2 +- server.py | 32 ++++++++++++++---- 3 files changed, 92 insertions(+), 9 deletions(-) diff --git a/app/frontend_management.py b/app/frontend_management.py index cce0c117d..bdaa85812 100644 --- a/app/frontend_management.py +++ b/app/frontend_management.py @@ -10,7 +10,8 @@ import importlib from dataclasses import dataclass from functools import cached_property from pathlib import Path -from typing import TypedDict, Optional +from typing import Dict, TypedDict, Optional +from aiohttp import web from importlib.metadata import version import requests @@ -257,7 +258,54 @@ comfyui-frontend-package is not installed. sys.exit(-1) @classmethod - def templates_path(cls) -> str: + def template_asset_map(cls) -> Optional[Dict[str, str]]: + """Return a mapping of template asset names to their absolute paths.""" + try: + from comfyui_workflow_templates import ( + get_asset_path, + iter_templates, + ) + except ImportError: + logging.error( + f""" +********** ERROR *********** + +comfyui-workflow-templates is not installed. + +{frontend_install_warning_message()} + +********** ERROR *********** +""".strip() + ) + return None + + try: + template_entries = list(iter_templates()) + except Exception as exc: + logging.error(f"Failed to enumerate workflow templates: {exc}") + return None + + asset_map: Dict[str, str] = {} + try: + for entry in template_entries: + for asset in entry.assets: + asset_map[asset.filename] = get_asset_path( + entry.template_id, asset.filename + ) + except Exception as exc: + logging.error(f"Failed to resolve template asset paths: {exc}") + return None + + if not asset_map: + logging.error("No workflow template assets found. Did the packages install correctly?") + return None + + return asset_map + + + @classmethod + def legacy_templates_path(cls) -> Optional[str]: + """Return the legacy templates directory shipped inside the meta package.""" try: import comfyui_workflow_templates @@ -276,6 +324,7 @@ comfyui-workflow-templates is not installed. ********** ERROR *********** """.strip() ) + return None @classmethod def embedded_docs_path(cls) -> str: @@ -392,3 +441,17 @@ comfyui-workflow-templates is not installed. logging.info("Falling back to the default frontend.") check_frontend_version() return cls.default_frontend_path() + @classmethod + def template_asset_handler(cls): + assets = cls.template_asset_map() + if not assets: + return None + + async def serve_template(request: web.Request) -> web.StreamResponse: + rel_path = request.match_info.get("path", "") + target = assets.get(rel_path) + if target is None: + raise web.HTTPNotFound() + return web.FileResponse(target) + + return serve_template diff --git a/requirements.txt b/requirements.txt index 249c36dee..36c39f338 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.28.8 -comfyui-workflow-templates==0.2.11 +comfyui-workflow-templates==0.3.1 comfyui-embedded-docs==0.3.1 torch torchsde diff --git a/server.py b/server.py index d059d3dc9..d9d5c491f 100644 --- a/server.py +++ b/server.py @@ -30,7 +30,7 @@ import comfy.model_management from comfy_api import feature_flags import node_helpers from comfyui_version import __version__ -from app.frontend_management import FrontendManager +from app.frontend_management import FrontendManager, parse_version from comfy_api.internal import _ComfyNodeInternal from app.user_manager import UserManager @@ -849,11 +849,31 @@ class PromptServer(): for name, dir in nodes.EXTENSION_WEB_DIRS.items(): self.app.add_routes([web.static('/extensions/' + name, dir)]) - workflow_templates_path = FrontendManager.templates_path() - if workflow_templates_path: - self.app.add_routes([ - web.static('/templates', workflow_templates_path) - ]) + installed_templates_version = FrontendManager.get_installed_templates_version() + use_legacy_templates = True + if installed_templates_version: + try: + use_legacy_templates = ( + parse_version(installed_templates_version) + < parse_version("0.3.0") + ) + except Exception as exc: + logging.warning( + "Unable to parse templates version '%s': %s", + installed_templates_version, + exc, + ) + + if use_legacy_templates: + workflow_templates_path = FrontendManager.legacy_templates_path() + if workflow_templates_path: + self.app.add_routes([ + web.static('/templates', workflow_templates_path) + ]) + else: + handler = FrontendManager.template_asset_handler() + if handler: + self.app.router.add_get("/templates/{path:.*}", handler) # Serve embedded documentation from the package embedded_docs_path = FrontendManager.embedded_docs_path() From f5e66d5e47271253edad5c4eddd817b0d6a23340 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 20 Nov 2025 12:08:03 -0800 Subject: [PATCH 590/759] Fix ImageBatch with different channel count. (#10815) --- nodes.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nodes.py b/nodes.py index ac14e39a7..75e820e66 100644 --- a/nodes.py +++ b/nodes.py @@ -1852,6 +1852,10 @@ class ImageBatch: CATEGORY = "image" def batch(self, image1, image2): + if image1.shape[-1] != image2.shape[-1]: + channels = min(image1.shape[-1], image2.shape[-1]) + image1 = image1[..., :channels] + image2 = image2[..., :channels] if image1.shape[1:] != image2.shape[1:]: image2 = comfy.utils.common_upscale(image2.movedim(-1,1), image1.shape[2], image1.shape[1], "bilinear", "center").movedim(1,-1) s = torch.cat((image1, image2), dim=0) From 9e00ce5b76ec04be37375310512a443605b95077 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Thu, 20 Nov 2025 14:42:46 -0800 Subject: [PATCH 591/759] Make Batch Images node add alpha channel when one of the inputs has it (#10816) * When one Batch Image input has alpha and one does not, add empty alpha channel * Use torch.nn.functional.pad --- nodes.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nodes.py b/nodes.py index 75e820e66..030371633 100644 --- a/nodes.py +++ b/nodes.py @@ -1853,9 +1853,10 @@ class ImageBatch: def batch(self, image1, image2): if image1.shape[-1] != image2.shape[-1]: - channels = min(image1.shape[-1], image2.shape[-1]) - image1 = image1[..., :channels] - image2 = image2[..., :channels] + if image1.shape[-1] > image2.shape[-1]: + image2 = torch.nn.functional.pad(image2, (0,1), mode='constant', value=1.0) + else: + image1 = torch.nn.functional.pad(image1, (0,1), mode='constant', value=1.0) if image1.shape[1:] != image2.shape[1:]: image2 = comfy.utils.common_upscale(image2.movedim(-1,1), image1.shape[2], image1.shape[1], "bilinear", "center").movedim(1,-1) s = torch.cat((image1, image2), dim=0) From 7b8389578e88dcd13b1cf6aea5404047298c9183 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 21 Nov 2025 02:17:47 +0200 Subject: [PATCH 592/759] feat(api-nodes): add Nano Banana Pro (#10814) * feat(api-nodes): add Nano Banana Pro * frontend bump to 1.28.9 --- comfy_api_nodes/apis/gemini_api.py | 5 +- comfy_api_nodes/nodes_gemini.py | 205 ++++++++++++++++++++++++++++- comfy_api_nodes/util/client.py | 13 +- requirements.txt | 2 +- 4 files changed, 215 insertions(+), 10 deletions(-) diff --git a/comfy_api_nodes/apis/gemini_api.py b/comfy_api_nodes/apis/gemini_api.py index f63e02693..710f173f1 100644 --- a/comfy_api_nodes/apis/gemini_api.py +++ b/comfy_api_nodes/apis/gemini_api.py @@ -68,7 +68,7 @@ class GeminiTextPart(BaseModel): class GeminiContent(BaseModel): - parts: list[GeminiPart] = Field(...) + parts: list[GeminiPart] = Field([]) role: GeminiRole = Field(..., examples=["user"]) @@ -120,7 +120,7 @@ class GeminiGenerationConfig(BaseModel): class GeminiImageConfig(BaseModel): aspectRatio: str | None = Field(None) - resolution: str | None = Field(None) + imageSize: str | None = Field(None) class GeminiImageGenerationConfig(GeminiGenerationConfig): @@ -227,3 +227,4 @@ class GeminiGenerateContentResponse(BaseModel): candidates: list[GeminiCandidate] | None = Field(None) promptFeedback: GeminiPromptFeedback | None = Field(None) usageMetadata: GeminiUsageMetadata | None = Field(None) + modelVersion: str | None = Field(None) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 6e746eebd..be752c885 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -29,11 +29,13 @@ from comfy_api_nodes.apis.gemini_api import ( GeminiMimeType, GeminiPart, GeminiRole, + Modality, ) from comfy_api_nodes.util import ( ApiEndpoint, audio_to_base64_string, bytesio_to_image_tensor, + get_number_of_images, sync_op, tensor_to_base64_string, validate_string, @@ -147,6 +149,49 @@ def get_image_from_response(response: GeminiGenerateContentResponse) -> torch.Te return torch.cat(image_tensors, dim=0) +def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | None: + if not response.modelVersion: + return None + # Define prices (Cost per 1,000,000 tokens), see https://cloud.google.com/vertex-ai/generative-ai/pricing + if response.modelVersion in ("gemini-2.5-pro-preview-05-06", "gemini-2.5-pro"): + input_tokens_price = 1.25 + output_text_tokens_price = 10.0 + output_image_tokens_price = 0.0 + elif response.modelVersion in ( + "gemini-2.5-flash-preview-04-17", + "gemini-2.5-flash", + ): + input_tokens_price = 0.30 + output_text_tokens_price = 2.50 + output_image_tokens_price = 0.0 + elif response.modelVersion in ( + "gemini-2.5-flash-image-preview", + "gemini-2.5-flash-image", + ): + input_tokens_price = 0.30 + output_text_tokens_price = 2.50 + output_image_tokens_price = 30.0 + elif response.modelVersion == "gemini-3-pro-preview": + input_tokens_price = 2 + output_text_tokens_price = 12.0 + output_image_tokens_price = 0.0 + elif response.modelVersion == "gemini-3-pro-image-preview": + input_tokens_price = 2 + output_text_tokens_price = 12.0 + output_image_tokens_price = 120.0 + else: + return None + final_price = response.usageMetadata.promptTokenCount * input_tokens_price + for i in response.usageMetadata.candidatesTokensDetails: + if i.modality == Modality.IMAGE: + final_price += output_image_tokens_price * i.tokenCount # for Nano Banana models + else: + final_price += output_text_tokens_price * i.tokenCount + if response.usageMetadata.thoughtsTokenCount: + final_price += output_text_tokens_price * response.usageMetadata.thoughtsTokenCount + return final_price / 1_000_000.0 + + class GeminiNode(IO.ComfyNode): """ Node to generate text responses from a Gemini model. @@ -314,6 +359,7 @@ class GeminiNode(IO.ComfyNode): ] ), response_model=GeminiGenerateContentResponse, + price_extractor=calculate_tokens_price, ) output_text = get_text_from_response(response) @@ -476,6 +522,13 @@ class GeminiImage(IO.ComfyNode): "or otherwise generates 1:1 squares.", optional=True, ), + IO.Combo.Input( + "response_modalities", + options=["IMAGE+TEXT", "IMAGE"], + tooltip="Choose 'IMAGE' for image-only output, or " + "'IMAGE+TEXT' to return both the generated image and a text response.", + optional=True, + ), ], outputs=[ IO.Image.Output(), @@ -498,6 +551,7 @@ class GeminiImage(IO.ComfyNode): images: torch.Tensor | None = None, files: list[GeminiPart] | None = None, aspect_ratio: str = "auto", + response_modalities: str = "IMAGE+TEXT", ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) parts: list[GeminiPart] = [GeminiPart(text=prompt)] @@ -520,17 +574,16 @@ class GeminiImage(IO.ComfyNode): GeminiContent(role=GeminiRole.user, parts=parts), ], generationConfig=GeminiImageGenerationConfig( - responseModalities=["TEXT", "IMAGE"], + responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]), imageConfig=None if aspect_ratio == "auto" else image_config, ), ), response_model=GeminiGenerateContentResponse, + price_extractor=calculate_tokens_price, ) - output_image = get_image_from_response(response) output_text = get_text_from_response(response) if output_text: - # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. render_spec = { "node_id": cls.hidden.unique_id, "component": "ChatHistoryWidget", @@ -551,9 +604,150 @@ class GeminiImage(IO.ComfyNode): "display_component", render_spec, ) + return IO.NodeOutput(get_image_from_response(response), output_text) - output_text = output_text or "Empty response from Gemini model..." - return IO.NodeOutput(output_image, output_text) + +class GeminiImage2(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="GeminiImage2Node", + display_name="Nano Banana Pro (Google Gemini Image)", + category="api node/image/Gemini", + description="Generate or edit images synchronously via Google Vertex API.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + tooltip="Text prompt describing the image to generate or the edits to apply. " + "Include any constraints, styles, or details the model should follow.", + default="", + ), + IO.Combo.Input( + "model", + options=["gemini-3-pro-image-preview"], + ), + IO.Int.Input( + "seed", + default=42, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="When the seed is fixed to a specific value, the model makes a best effort to provide " + "the same response for repeated requests. Deterministic output isn't guaranteed. " + "Also, changing the model or parameter settings, such as the temperature, " + "can cause variations in the response even when you use the same seed value. " + "By default, a random seed value is used.", + ), + IO.Combo.Input( + "aspect_ratio", + options=["auto", "1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"], + default="auto", + tooltip="If set to 'auto', matches your input image's aspect ratio; " + "if no image is provided, generates a 1:1 square.", + ), + IO.Combo.Input( + "resolution", + options=["1K", "2K", "4K"], + tooltip="Target output resolution. For 2K/4K the native Gemini upscaler is used.", + ), + IO.Combo.Input( + "response_modalities", + options=["IMAGE+TEXT", "IMAGE"], + tooltip="Choose 'IMAGE' for image-only output, or " + "'IMAGE+TEXT' to return both the generated image and a text response.", + ), + IO.Image.Input( + "images", + optional=True, + tooltip="Optional reference image(s). " + "To include multiple images, use the Batch Images node (up to 14).", + ), + IO.Custom("GEMINI_INPUT_FILES").Input( + "files", + optional=True, + tooltip="Optional file(s) to use as context for the model. " + "Accepts inputs from the Gemini Generate Content Input Files node.", + ), + ], + outputs=[ + IO.Image.Output(), + IO.String.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + prompt: str, + model: str, + seed: int, + aspect_ratio: str, + resolution: str, + response_modalities: str, + images: torch.Tensor | None = None, + files: list[GeminiPart] | None = None, + ) -> IO.NodeOutput: + validate_string(prompt, strip_whitespace=True, min_length=1) + + parts: list[GeminiPart] = [GeminiPart(text=prompt)] + if images is not None: + if get_number_of_images(images) > 14: + raise ValueError("The current maximum number of supported images is 14.") + parts.extend(create_image_parts(images)) + if files is not None: + parts.extend(files) + + image_config = GeminiImageConfig(imageSize=resolution) + if aspect_ratio != "auto": + image_config.aspectRatio = aspect_ratio + + response = await sync_op( + cls, + ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"), + data=GeminiImageGenerateContentRequest( + contents=[ + GeminiContent(role=GeminiRole.user, parts=parts), + ], + generationConfig=GeminiImageGenerationConfig( + responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]), + imageConfig=image_config, + ), + ), + response_model=GeminiGenerateContentResponse, + price_extractor=calculate_tokens_price, + ) + + output_text = get_text_from_response(response) + if output_text: + render_spec = { + "node_id": cls.hidden.unique_id, + "component": "ChatHistoryWidget", + "props": { + "history": json.dumps( + [ + { + "prompt": prompt, + "response": output_text, + "response_id": str(uuid.uuid4()), + "timestamp": time.time(), + } + ] + ), + }, + } + PromptServer.instance.send_sync( + "display_component", + render_spec, + ) + return IO.NodeOutput(get_image_from_response(response), output_text) class GeminiExtension(ComfyExtension): @@ -562,6 +756,7 @@ class GeminiExtension(ComfyExtension): return [ GeminiNode, GeminiImage, + GeminiImage2, GeminiInputFiles, ] diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py index ad6e3c0d0..bf01d7d36 100644 --- a/comfy_api_nodes/util/client.py +++ b/comfy_api_nodes/util/client.py @@ -63,6 +63,7 @@ class _RequestConfig: estimated_total: Optional[int] = None final_label_on_success: Optional[str] = "Completed" progress_origin_ts: Optional[float] = None + price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None @dataclass @@ -87,6 +88,7 @@ async def sync_op( endpoint: ApiEndpoint, *, response_model: Type[M], + price_extractor: Optional[Callable[[M], Optional[float]]] = None, data: Optional[BaseModel] = None, files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None, content_type: str = "application/json", @@ -104,6 +106,7 @@ async def sync_op( raw = await sync_op_raw( cls, endpoint, + price_extractor=_wrap_model_extractor(response_model, price_extractor), data=data, files=files, content_type=content_type, @@ -175,6 +178,7 @@ async def sync_op_raw( cls: type[IO.ComfyNode], endpoint: ApiEndpoint, *, + price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None, data: Optional[Union[dict[str, Any], BaseModel]] = None, files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None, content_type: str = "application/json", @@ -216,6 +220,7 @@ async def sync_op_raw( estimated_total=estimated_duration, final_label_on_success=final_label_on_success, progress_origin_ts=progress_origin_ts, + price_extractor=price_extractor, ) return await _request_base(cfg, expect_binary=as_binary) @@ -425,7 +430,8 @@ def _display_text( display_lines.append(f"Status: {status.capitalize() if isinstance(status, str) else status}") if price is not None: p = f"{float(price):,.4f}".rstrip("0").rstrip(".") - display_lines.append(f"Price: ${p}") + if p != "0": + display_lines.append(f"Price: ${p}") if text is not None: display_lines.append(text) if display_lines: @@ -581,6 +587,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): delay = cfg.retry_delay operation_succeeded: bool = False final_elapsed_seconds: Optional[int] = None + extracted_price: Optional[float] = None while True: attempt += 1 stop_event = asyncio.Event() @@ -768,6 +775,8 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): except json.JSONDecodeError: payload = {"_raw": text} response_content_to_log = payload if isinstance(payload, dict) else text + with contextlib.suppress(Exception): + extracted_price = cfg.price_extractor(payload) if cfg.price_extractor else None operation_succeeded = True final_elapsed_seconds = int(time.monotonic() - start_time) try: @@ -872,7 +881,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): else int(time.monotonic() - start_time) ), estimated_total=cfg.estimated_total, - price=None, + price=extracted_price, is_queued=False, processing_elapsed_seconds=final_elapsed_seconds, ) diff --git a/requirements.txt b/requirements.txt index 36c39f338..8c1946f3d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.28.8 +comfyui-frontend-package==1.28.9 comfyui-workflow-templates==0.3.1 comfyui-embedded-docs==0.3.1 torch From b75d349f25ccb702895c6f1b8af7aded63a7f7e2 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 21 Nov 2025 02:33:54 +0200 Subject: [PATCH 593/759] fix(KlingLipSyncAudioToVideoNode): convert audio to mp3 format (#10811) --- comfy_api_nodes/nodes_kling.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 7b23e9cf9..36852038b 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -518,7 +518,9 @@ async def execute_lipsync( # Upload the audio file to Comfy API and get download URL if audio: - audio_url = await upload_audio_to_comfyapi(cls, audio) + audio_url = await upload_audio_to_comfyapi( + cls, audio, container_format="mp3", codec_name="libmp3lame", mime_type="audio/mpeg", filename="output.mp3" + ) logging.info("Uploaded audio to Comfy API. URL: %s", audio_url) else: audio_url = None From 10e90a5757906ecdb71b84d41173813d7f62c140 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Thu, 20 Nov 2025 18:20:52 -0800 Subject: [PATCH 594/759] bump comfyui-workflow-templates for nano banana 2 (#10818) * bump templates * bump templates --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8c1946f3d..624aa7362 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.28.9 -comfyui-workflow-templates==0.3.1 +comfyui-workflow-templates==0.6.0 comfyui-embedded-docs==0.3.1 torch torchsde From 943b3b615d40542ea19bc8ff8ad2950c0a094605 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:44:43 -0800 Subject: [PATCH 595/759] HunyuanVideo 1.5 (#10819) * init * update * Update model.py * Update model.py * remove print * Fix text encoding * Prevent empty negative prompt Really doesn't work otherwise * fp16 works * I2V * Update model_base.py * Update nodes_hunyuan.py * Better latent rgb factors * Use the correct sigclip output... * Support HunyuanVideo1.5 SR model * whitespaces... * Proper latent channel count * SR model fixes This also still needs timesteps scheduling based on the noise scale, can be used with two samplers too already * vae_refiner: roll the convolution through temporal Work in progress. Roll the convolution through time using 2-latent-frame chunks and a FIFO queue for the convolution seams. * Support HunyuanVideo15 latent resampler * fix * Some cleanup Co-Authored-By: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> * Proper hyvid15 I2V channels Co-Authored-By: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> * Fix TokenRefiner for fp16 Otherwise x.sum has infs, just in case only casting if input is fp16, I don't know if necessary. * Bugfix for the HunyuanVideo15 SR model * vae_refiner: roll the convolution through temporal II Roll the convolution through time using 2-latent-frame chunks and a FIFO queue for the convolution seams. Added support for encoder, lowered to 1 latent frame to save more VRAM, made work for Hunyuan Image 3.0 (as code shared). Fixed names, cleaned up code. * Allow any number of input frames in VAE. * Better VAE encode mem estimation. * Lowvram fix. * Fix hunyuan image 2.1 refiner. * Fix mistake. * Name changes. * Rename. * Whitespace. * Fix. * Fix. --------- Co-authored-by: kijai <40791699+kijai@users.noreply.github.com> Co-authored-by: Rattus --- comfy/latent_formats.py | 60 ++++ comfy/ldm/hunyuan_video/model.py | 54 +++- comfy/ldm/hunyuan_video/upsampler.py | 120 ++++++++ comfy/ldm/hunyuan_video/vae_refiner.py | 284 +++++++++++------- comfy/model_base.py | 91 ++++++ comfy/model_detection.py | 10 + comfy/sd.py | 12 +- comfy/supported_models.py | 50 ++- comfy/text_encoders/hunyuan_video.py | 9 + comfy/text_encoders/qwen_image.py | 4 +- comfy_api/latest/_io.py | 4 + comfy_extras/nodes_hunyuan.py | 201 ++++++++++++- folder_paths.py | 2 + .../put_latent_upscale_models_here | 0 nodes.py | 2 +- 15 files changed, 777 insertions(+), 126 deletions(-) create mode 100644 comfy/ldm/hunyuan_video/upsampler.py create mode 100644 models/latent_upscale_models/put_latent_upscale_models_here diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 77e642a94..204fc048d 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -611,6 +611,66 @@ class HunyuanImage21Refiner(LatentFormat): latent_dimensions = 3 scale_factor = 1.03682 + def process_in(self, latent): + out = latent * self.scale_factor + out = torch.cat((out[:, :, :1], out), dim=2) + out = out.permute(0, 2, 1, 3, 4) + b, f_times_2, c, h, w = out.shape + out = out.reshape(b, f_times_2 // 2, 2 * c, h, w) + out = out.permute(0, 2, 1, 3, 4).contiguous() + return out + + def process_out(self, latent): + z = latent / self.scale_factor + z = z.permute(0, 2, 1, 3, 4) + b, f, c, h, w = z.shape + z = z.reshape(b, f, 2, c // 2, h, w) + z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w) + z = z.permute(0, 2, 1, 3, 4) + z = z[:, :, 1:] + return z + +class HunyuanVideo15(LatentFormat): + latent_rgb_factors = [ + [ 0.0568, -0.0521, -0.0131], + [ 0.0014, 0.0735, 0.0326], + [ 0.0186, 0.0531, -0.0138], + [-0.0031, 0.0051, 0.0288], + [ 0.0110, 0.0556, 0.0432], + [-0.0041, -0.0023, -0.0485], + [ 0.0530, 0.0413, 0.0253], + [ 0.0283, 0.0251, 0.0339], + [ 0.0277, -0.0372, -0.0093], + [ 0.0393, 0.0944, 0.1131], + [ 0.0020, 0.0251, 0.0037], + [-0.0017, 0.0012, 0.0234], + [ 0.0468, 0.0436, 0.0203], + [ 0.0354, 0.0439, -0.0233], + [ 0.0090, 0.0123, 0.0346], + [ 0.0382, 0.0029, 0.0217], + [ 0.0261, -0.0300, 0.0030], + [-0.0088, -0.0220, -0.0283], + [-0.0272, -0.0121, -0.0363], + [-0.0664, -0.0622, 0.0144], + [ 0.0414, 0.0479, 0.0529], + [ 0.0355, 0.0612, -0.0247], + [ 0.0147, 0.0264, 0.0174], + [ 0.0438, 0.0038, 0.0542], + [ 0.0431, -0.0573, -0.0033], + [-0.0162, -0.0211, -0.0406], + [-0.0487, -0.0295, -0.0393], + [ 0.0005, -0.0109, 0.0253], + [ 0.0296, 0.0591, 0.0353], + [ 0.0119, 0.0181, -0.0306], + [-0.0085, -0.0362, 0.0229], + [ 0.0005, -0.0106, 0.0242] + ] + + latent_rgb_factors_bias = [ 0.0456, -0.0202, -0.0644] + latent_channels = 32 + latent_dimensions = 3 + scale_factor = 1.03682 + class Hunyuan3Dv2(LatentFormat): latent_channels = 64 latent_dimensions = 1 diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py index 5132e6c07..f75c6e0e1 100644 --- a/comfy/ldm/hunyuan_video/model.py +++ b/comfy/ldm/hunyuan_video/model.py @@ -6,7 +6,6 @@ import comfy.ldm.flux.layers import comfy.ldm.modules.diffusionmodules.mmdit from comfy.ldm.modules.attention import optimized_attention - from dataclasses import dataclass from einops import repeat @@ -42,6 +41,8 @@ class HunyuanVideoParams: guidance_embed: bool byt5: bool meanflow: bool + use_cond_type_embedding: bool + vision_in_dim: int class SelfAttentionRef(nn.Module): @@ -157,7 +158,10 @@ class TokenRefiner(nn.Module): t = self.t_embedder(timestep_embedding(timesteps, 256, time_factor=1.0).to(x.dtype)) # m = mask.float().unsqueeze(-1) # c = (x.float() * m).sum(dim=1) / m.sum(dim=1) #TODO: the following works when the x.shape is the same length as the tokens but might break otherwise - c = x.sum(dim=1) / x.shape[1] + if x.dtype == torch.float16: + c = x.float().sum(dim=1) / x.shape[1] + else: + c = x.sum(dim=1) / x.shape[1] c = t + self.c_embedder(c.to(x.dtype)) x = self.input_embedder(x) @@ -196,11 +200,15 @@ class HunyuanVideo(nn.Module): def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs): super().__init__() self.dtype = dtype + operation_settings = {"operations": operations, "device": device, "dtype": dtype} + params = HunyuanVideoParams(**kwargs) self.params = params self.patch_size = params.patch_size self.in_channels = params.in_channels self.out_channels = params.out_channels + self.use_cond_type_embedding = params.use_cond_type_embedding + self.vision_in_dim = params.vision_in_dim if params.hidden_size % params.num_heads != 0: raise ValueError( f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}" @@ -266,6 +274,18 @@ class HunyuanVideo(nn.Module): if final_layer: self.final_layer = LastLayer(self.hidden_size, self.patch_size[-1], self.out_channels, dtype=dtype, device=device, operations=operations) + # HunyuanVideo 1.5 specific modules + if self.vision_in_dim is not None: + from comfy.ldm.wan.model import MLPProj + self.vision_in = MLPProj(in_dim=self.vision_in_dim, out_dim=self.hidden_size, operation_settings=operation_settings) + else: + self.vision_in = None + if self.use_cond_type_embedding: + # 0: text_encoder feature 1: byt5 feature 2: vision_encoder feature + self.cond_type_embedding = nn.Embedding(3, self.hidden_size) + else: + self.cond_type_embedding = None + def forward_orig( self, img: Tensor, @@ -276,6 +296,7 @@ class HunyuanVideo(nn.Module): timesteps: Tensor, y: Tensor = None, txt_byt5=None, + clip_fea=None, guidance: Tensor = None, guiding_frame_index=None, ref_latent=None, @@ -331,12 +352,31 @@ class HunyuanVideo(nn.Module): txt = self.txt_in(txt, timesteps, txt_mask, transformer_options=transformer_options) + if self.cond_type_embedding is not None: + self.cond_type_embedding.to(txt.device) + cond_emb = self.cond_type_embedding(torch.zeros_like(txt[:, :, 0], device=txt.device, dtype=torch.long)) + txt = txt + cond_emb.to(txt.dtype) + if self.byt5_in is not None and txt_byt5 is not None: txt_byt5 = self.byt5_in(txt_byt5) + if self.cond_type_embedding is not None: + cond_emb = self.cond_type_embedding(torch.ones_like(txt_byt5[:, :, 0], device=txt_byt5.device, dtype=torch.long)) + txt_byt5 = txt_byt5 + cond_emb.to(txt_byt5.dtype) + txt = torch.cat((txt_byt5, txt), dim=1) # byt5 first for HunyuanVideo1.5 + else: + txt = torch.cat((txt, txt_byt5), dim=1) txt_byt5_ids = torch.zeros((txt_ids.shape[0], txt_byt5.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype) - txt = torch.cat((txt, txt_byt5), dim=1) txt_ids = torch.cat((txt_ids, txt_byt5_ids), dim=1) + if clip_fea is not None: + txt_vision_states = self.vision_in(clip_fea) + if self.cond_type_embedding is not None: + cond_emb = self.cond_type_embedding(2 * torch.ones_like(txt_vision_states[:, :, 0], dtype=torch.long, device=txt_vision_states.device)) + txt_vision_states = txt_vision_states + cond_emb + txt = torch.cat((txt_vision_states.to(txt.dtype), txt), dim=1) + extra_txt_ids = torch.zeros((txt_ids.shape[0], txt_vision_states.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype) + txt_ids = torch.cat((txt_ids, extra_txt_ids), dim=1) + ids = torch.cat((img_ids, txt_ids), dim=1) pe = self.pe_embedder(ids) @@ -430,14 +470,14 @@ class HunyuanVideo(nn.Module): img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) return repeat(img_ids, "h w c -> b (h w) c", b=bs) - def forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs): + def forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs): return comfy.patcher_extension.WrapperExecutor.new_class_executor( self._forward, self, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) - ).execute(x, timestep, context, y, txt_byt5, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs) + ).execute(x, timestep, context, y, txt_byt5, clip_fea, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs) - def _forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs): + def _forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs): bs = x.shape[0] if len(self.patch_size) == 3: img_ids = self.img_ids(x) @@ -445,5 +485,5 @@ class HunyuanVideo(nn.Module): else: img_ids = self.img_ids_2d(x) txt_ids = torch.zeros((bs, context.shape[1], 2), device=x.device, dtype=x.dtype) - out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options) + out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, clip_fea, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options) return out diff --git a/comfy/ldm/hunyuan_video/upsampler.py b/comfy/ldm/hunyuan_video/upsampler.py new file mode 100644 index 000000000..9f5e91a59 --- /dev/null +++ b/comfy/ldm/hunyuan_video/upsampler.py @@ -0,0 +1,120 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from comfy.ldm.hunyuan_video.vae_refiner import RMS_norm, ResnetBlock, VideoConv3d +import model_management, model_patcher + +class SRResidualCausalBlock3D(nn.Module): + def __init__(self, channels: int): + super().__init__() + self.block = nn.Sequential( + VideoConv3d(channels, channels, kernel_size=3), + nn.SiLU(inplace=True), + VideoConv3d(channels, channels, kernel_size=3), + nn.SiLU(inplace=True), + VideoConv3d(channels, channels, kernel_size=3), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + self.block(x) + +class SRModel3DV2(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + hidden_channels: int = 64, + num_blocks: int = 6, + global_residual: bool = False, + ): + super().__init__() + self.in_conv = VideoConv3d(in_channels, hidden_channels, kernel_size=3) + self.blocks = nn.ModuleList([SRResidualCausalBlock3D(hidden_channels) for _ in range(num_blocks)]) + self.out_conv = VideoConv3d(hidden_channels, out_channels, kernel_size=3) + self.global_residual = bool(global_residual) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + residual = x + y = self.in_conv(x) + for blk in self.blocks: + y = blk(y) + y = self.out_conv(y) + if self.global_residual and (y.shape == residual.shape): + y = y + residual + return y + + +class Upsampler(nn.Module): + def __init__( + self, + z_channels: int, + out_channels: int, + block_out_channels: tuple[int, ...], + num_res_blocks: int = 2, + ): + super().__init__() + self.num_res_blocks = num_res_blocks + self.block_out_channels = block_out_channels + self.z_channels = z_channels + + ch = block_out_channels[0] + self.conv_in = VideoConv3d(z_channels, ch, kernel_size=3) + + self.up = nn.ModuleList() + + for i, tgt in enumerate(block_out_channels): + stage = nn.Module() + stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, + out_channels=tgt, + temb_channels=0, + conv_shortcut=False, + conv_op=VideoConv3d, norm_op=RMS_norm) + for j in range(num_res_blocks + 1)]) + ch = tgt + self.up.append(stage) + + self.norm_out = RMS_norm(ch) + self.conv_out = VideoConv3d(ch, out_channels, kernel_size=3) + + def forward(self, z): + """ + Args: + z: (B, C, T, H, W) + target_shape: (H, W) + """ + # z to block_in + repeats = self.block_out_channels[0] // (self.z_channels) + x = self.conv_in(z) + z.repeat_interleave(repeats=repeats, dim=1) + + # upsampling + for stage in self.up: + for blk in stage.block: + x = blk(x) + + out = self.conv_out(F.silu(self.norm_out(x))) + return out + +UPSAMPLERS = { + "720p": SRModel3DV2, + "1080p": Upsampler, +} + +class HunyuanVideo15SRModel(): + def __init__(self, model_type, config): + self.load_device = model_management.vae_device() + offload_device = model_management.vae_offload_device() + self.dtype = model_management.vae_dtype(self.load_device) + self.model_class = UPSAMPLERS.get(model_type) + self.model = self.model_class(**config).eval() + + self.patcher = model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device) + + def load_sd(self, sd): + return self.model.load_state_dict(sd, strict=True) + + def get_sd(self): + return self.model.state_dict() + + def resample_latent(self, latent): + model_management.load_model_gpu(self.patcher) + return self.model(latent.to(self.load_device)) diff --git a/comfy/ldm/hunyuan_video/vae_refiner.py b/comfy/ldm/hunyuan_video/vae_refiner.py index c2a0b507d..9f750dcc4 100644 --- a/comfy/ldm/hunyuan_video/vae_refiner.py +++ b/comfy/ldm/hunyuan_video/vae_refiner.py @@ -4,8 +4,40 @@ import torch.nn.functional as F from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d, Normalize import comfy.ops import comfy.ldm.models.autoencoder +import comfy.model_management ops = comfy.ops.disable_weight_init +class NoPadConv3d(nn.Module): + def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding=0, **kwargs): + super().__init__() + self.conv = ops.Conv3d(n_channels, out_channels, kernel_size, stride=stride, dilation=dilation, **kwargs) + + def forward(self, x): + return self.conv(x) + + +def conv_carry_causal_3d(xl, op, conv_carry_in=None, conv_carry_out=None): + + x = xl[0] + xl.clear() + + if conv_carry_out is not None: + to_push = x[:, :, -2:, :, :].clone() + conv_carry_out.append(to_push) + + if isinstance(op, NoPadConv3d): + if conv_carry_in is None: + x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2, 0), mode = 'replicate') + else: + carry_len = conv_carry_in[0].shape[2] + x = torch.cat([conv_carry_in.pop(0), x], dim=2) + x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2 - carry_len, 0), mode = 'replicate') + + out = op(x) + + return out + + class RMS_norm(nn.Module): def __init__(self, dim): super().__init__() @@ -14,7 +46,7 @@ class RMS_norm(nn.Module): self.gamma = nn.Parameter(torch.empty(shape)) def forward(self, x): - return F.normalize(x, dim=1) * self.scale * self.gamma + return F.normalize(x, dim=1) * self.scale * comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device) class DnSmpl(nn.Module): def __init__(self, ic, oc, tds=True, refiner_vae=True, op=VideoConv3d): @@ -27,11 +59,12 @@ class DnSmpl(nn.Module): self.tds = tds self.gs = fct * ic // oc - def forward(self, x): + def forward(self, x, conv_carry_in=None, conv_carry_out=None): r1 = 2 if self.tds else 1 - h = self.conv(x) + h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out) + + if self.tds and self.refiner_vae and conv_carry_in is None: - if self.tds and self.refiner_vae: hf = h[:, :, :1, :, :] b, c, f, ht, wd = hf.shape hf = hf.reshape(b, c, f, ht // 2, 2, wd // 2, 2) @@ -39,14 +72,7 @@ class DnSmpl(nn.Module): hf = hf.reshape(b, 2 * 2 * c, f, ht // 2, wd // 2) hf = torch.cat([hf, hf], dim=1) - hn = h[:, :, 1:, :, :] - b, c, frms, ht, wd = hn.shape - nf = frms // r1 - hn = hn.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2) - hn = hn.permute(0, 3, 5, 7, 1, 2, 4, 6) - hn = hn.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2) - - h = torch.cat([hf, hn], dim=2) + h = h[:, :, 1:, :, :] xf = x[:, :, :1, :, :] b, ci, f, ht, wd = xf.shape @@ -54,34 +80,32 @@ class DnSmpl(nn.Module): xf = xf.permute(0, 4, 6, 1, 2, 3, 5) xf = xf.reshape(b, 2 * 2 * ci, f, ht // 2, wd // 2) B, C, T, H, W = xf.shape - xf = xf.view(B, h.shape[1], self.gs // 2, T, H, W).mean(dim=2) + xf = xf.view(B, hf.shape[1], self.gs // 2, T, H, W).mean(dim=2) - xn = x[:, :, 1:, :, :] - b, ci, frms, ht, wd = xn.shape - nf = frms // r1 - xn = xn.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2) - xn = xn.permute(0, 3, 5, 7, 1, 2, 4, 6) - xn = xn.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2) - B, C, T, H, W = xn.shape - xn = xn.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2) - sc = torch.cat([xf, xn], dim=2) - else: - b, c, frms, ht, wd = h.shape + x = x[:, :, 1:, :, :] - nf = frms // r1 - h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2) - h = h.permute(0, 3, 5, 7, 1, 2, 4, 6) - h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2) + if h.shape[2] == 0: + return hf + xf - b, ci, frms, ht, wd = x.shape - nf = frms // r1 - sc = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2) - sc = sc.permute(0, 3, 5, 7, 1, 2, 4, 6) - sc = sc.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2) - B, C, T, H, W = sc.shape - sc = sc.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2) + b, c, frms, ht, wd = h.shape + nf = frms // r1 + h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2) + h = h.permute(0, 3, 5, 7, 1, 2, 4, 6) + h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2) - return h + sc + b, ci, frms, ht, wd = x.shape + nf = frms // r1 + x = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2) + x = x.permute(0, 3, 5, 7, 1, 2, 4, 6) + x = x.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2) + B, C, T, H, W = x.shape + x = x.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2) + + if self.tds and self.refiner_vae and conv_carry_in is None: + h = torch.cat([hf, h], dim=2) + x = torch.cat([xf, x], dim=2) + + return h + x class UpSmpl(nn.Module): @@ -94,11 +118,11 @@ class UpSmpl(nn.Module): self.tus = tus self.rp = fct * oc // ic - def forward(self, x): + def forward(self, x, conv_carry_in=None, conv_carry_out=None): r1 = 2 if self.tus else 1 - h = self.conv(x) + h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out) - if self.tus and self.refiner_vae: + if self.tus and self.refiner_vae and conv_carry_in is None: hf = h[:, :, :1, :, :] b, c, f, ht, wd = hf.shape nc = c // (2 * 2) @@ -107,14 +131,7 @@ class UpSmpl(nn.Module): hf = hf.reshape(b, nc, f, ht * 2, wd * 2) hf = hf[:, : hf.shape[1] // 2] - hn = h[:, :, 1:, :, :] - b, c, frms, ht, wd = hn.shape - nc = c // (r1 * 2 * 2) - hn = hn.reshape(b, r1, 2, 2, nc, frms, ht, wd) - hn = hn.permute(0, 4, 5, 1, 6, 2, 7, 3) - hn = hn.reshape(b, nc, frms * r1, ht * 2, wd * 2) - - h = torch.cat([hf, hn], dim=2) + h = h[:, :, 1:, :, :] xf = x[:, :, :1, :, :] b, ci, f, ht, wd = xf.shape @@ -125,29 +142,43 @@ class UpSmpl(nn.Module): xf = xf.permute(0, 3, 4, 5, 1, 6, 2) xf = xf.reshape(b, nc, f, ht * 2, wd * 2) - xn = x[:, :, 1:, :, :] - xn = xn.repeat_interleave(repeats=self.rp, dim=1) - b, c, frms, ht, wd = xn.shape - nc = c // (r1 * 2 * 2) - xn = xn.reshape(b, r1, 2, 2, nc, frms, ht, wd) - xn = xn.permute(0, 4, 5, 1, 6, 2, 7, 3) - xn = xn.reshape(b, nc, frms * r1, ht * 2, wd * 2) - sc = torch.cat([xf, xn], dim=2) - else: - b, c, frms, ht, wd = h.shape - nc = c // (r1 * 2 * 2) - h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd) - h = h.permute(0, 4, 5, 1, 6, 2, 7, 3) - h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2) + x = x[:, :, 1:, :, :] - sc = x.repeat_interleave(repeats=self.rp, dim=1) - b, c, frms, ht, wd = sc.shape - nc = c // (r1 * 2 * 2) - sc = sc.reshape(b, r1, 2, 2, nc, frms, ht, wd) - sc = sc.permute(0, 4, 5, 1, 6, 2, 7, 3) - sc = sc.reshape(b, nc, frms * r1, ht * 2, wd * 2) + b, c, frms, ht, wd = h.shape + nc = c // (r1 * 2 * 2) + h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd) + h = h.permute(0, 4, 5, 1, 6, 2, 7, 3) + h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2) - return h + sc + x = x.repeat_interleave(repeats=self.rp, dim=1) + b, c, frms, ht, wd = x.shape + nc = c // (r1 * 2 * 2) + x = x.reshape(b, r1, 2, 2, nc, frms, ht, wd) + x = x.permute(0, 4, 5, 1, 6, 2, 7, 3) + x = x.reshape(b, nc, frms * r1, ht * 2, wd * 2) + + if self.tus and self.refiner_vae and conv_carry_in is None: + h = torch.cat([hf, h], dim=2) + x = torch.cat([xf, x], dim=2) + + return h + x + +class HunyuanRefinerResnetBlock(ResnetBlock): + def __init__(self, in_channels, out_channels, conv_op=NoPadConv3d, norm_op=RMS_norm): + super().__init__(in_channels=in_channels, out_channels=out_channels, temb_channels=0, conv_op=conv_op, norm_op=norm_op) + + def forward(self, x, conv_carry_in=None, conv_carry_out=None): + h = x + h = [ self.swish(self.norm1(x)) ] + h = conv_carry_causal_3d(h, self.conv1, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out) + + h = [ self.dropout(self.swish(self.norm2(h))) ] + h = conv_carry_causal_3d(h, self.conv2, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out) + + if self.in_channels != self.out_channels: + x = self.nin_shortcut(x) + + return x+h class Encoder(nn.Module): def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks, @@ -160,7 +191,7 @@ class Encoder(nn.Module): self.refiner_vae = refiner_vae if self.refiner_vae: - conv_op = VideoConv3d + conv_op = NoPadConv3d norm_op = RMS_norm else: conv_op = ops.Conv3d @@ -175,10 +206,9 @@ class Encoder(nn.Module): for i, tgt in enumerate(block_out_channels): stage = nn.Module() - stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, - out_channels=tgt, - temb_channels=0, - conv_op=conv_op, norm_op=norm_op) + stage.block = nn.ModuleList([HunyuanRefinerResnetBlock(in_channels=ch if j == 0 else tgt, + out_channels=tgt, + conv_op=conv_op, norm_op=norm_op) for j in range(num_res_blocks)]) ch = tgt if i < depth: @@ -188,9 +218,9 @@ class Encoder(nn.Module): self.down.append(stage) self.mid = nn.Module() - self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op) + self.mid.block_1 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op) self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op) - self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op) + self.mid.block_2 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op) self.norm_out = norm_op(ch) self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1) @@ -201,31 +231,50 @@ class Encoder(nn.Module): if not self.refiner_vae and x.shape[2] == 1: x = x.expand(-1, -1, self.ffactor_temporal, -1, -1) - x = self.conv_in(x) + if self.refiner_vae: + xl = [x[:, :, :1, :, :]] + if x.shape[2] > self.ffactor_temporal: + xl += torch.split(x[:, :, 1: 1 + ((x.shape[2] - 1) // self.ffactor_temporal) * self.ffactor_temporal, :, :], self.ffactor_temporal * 2, dim=2) + x = xl + else: + x = [x] + out = [] - for stage in self.down: - for blk in stage.block: - x = blk(x) - if hasattr(stage, 'downsample'): - x = stage.downsample(x) + conv_carry_in = None - x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x))) + for i, x1 in enumerate(x): + conv_carry_out = [] + if i == len(x) - 1: + conv_carry_out = None + x1 = [ x1 ] + x1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out) + + for stage in self.down: + for blk in stage.block: + x1 = blk(x1, conv_carry_in, conv_carry_out) + if hasattr(stage, 'downsample'): + x1 = stage.downsample(x1, conv_carry_in, conv_carry_out) + + out.append(x1) + conv_carry_in = conv_carry_out + + if len(out) > 1: + out = torch.cat(out, dim=2) + else: + out = out[0] + + x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(out))) + del out b, c, t, h, w = x.shape grp = c // (self.z_channels << 1) skip = x.view(b, c // grp, grp, t, h, w).mean(2) - out = self.conv_out(F.silu(self.norm_out(x))) + skip + out = conv_carry_causal_3d([F.silu(self.norm_out(x))], self.conv_out) + skip if self.refiner_vae: out = self.regul(out)[0] - out = torch.cat((out[:, :, :1], out), dim=2) - out = out.permute(0, 2, 1, 3, 4) - b, f_times_2, c, h, w = out.shape - out = out.reshape(b, f_times_2 // 2, 2 * c, h, w) - out = out.permute(0, 2, 1, 3, 4).contiguous() - return out class Decoder(nn.Module): @@ -239,7 +288,7 @@ class Decoder(nn.Module): self.refiner_vae = refiner_vae if self.refiner_vae: - conv_op = VideoConv3d + conv_op = NoPadConv3d norm_op = RMS_norm else: conv_op = ops.Conv3d @@ -249,9 +298,9 @@ class Decoder(nn.Module): self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1) self.mid = nn.Module() - self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op) + self.mid.block_1 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op) self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op) - self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op) + self.mid.block_2 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op) self.up = nn.ModuleList() depth = (ffactor_spatial >> 1).bit_length() @@ -259,10 +308,9 @@ class Decoder(nn.Module): for i, tgt in enumerate(block_out_channels): stage = nn.Module() - stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, - out_channels=tgt, - temb_channels=0, - conv_op=conv_op, norm_op=norm_op) + stage.block = nn.ModuleList([HunyuanRefinerResnetBlock(in_channels=ch if j == 0 else tgt, + out_channels=tgt, + conv_op=conv_op, norm_op=norm_op) for j in range(num_res_blocks + 1)]) ch = tgt if i < depth: @@ -275,27 +323,41 @@ class Decoder(nn.Module): self.conv_out = conv_op(ch, out_channels, 3, stride=1, padding=1) def forward(self, z): - if self.refiner_vae: - z = z.permute(0, 2, 1, 3, 4) - b, f, c, h, w = z.shape - z = z.reshape(b, f, 2, c // 2, h, w) - z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w) - z = z.permute(0, 2, 1, 3, 4) - z = z[:, :, 1:] - - x = self.conv_in(z) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1) + x = conv_carry_causal_3d([z], self.conv_in) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1) x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x))) - for stage in self.up: - for blk in stage.block: - x = blk(x) - if hasattr(stage, 'upsample'): - x = stage.upsample(x) + if self.refiner_vae: + x = torch.split(x, 2, dim=2) + else: + x = [ x ] + out = [] - out = self.conv_out(F.silu(self.norm_out(x))) + conv_carry_in = None + + for i, x1 in enumerate(x): + conv_carry_out = [] + if i == len(x) - 1: + conv_carry_out = None + for stage in self.up: + for blk in stage.block: + x1 = blk(x1, conv_carry_in, conv_carry_out) + if hasattr(stage, 'upsample'): + x1 = stage.upsample(x1, conv_carry_in, conv_carry_out) + + x1 = [ F.silu(self.norm_out(x1)) ] + x1 = conv_carry_causal_3d(x1, self.conv_out, conv_carry_in, conv_carry_out) + out.append(x1) + conv_carry_in = conv_carry_out + del x + + if len(out) > 1: + out = torch.cat(out, dim=2) + else: + out = out[0] if not self.refiner_vae: if z.shape[-3] == 1: out = out[:, :, -1:] return out + diff --git a/comfy/model_base.py b/comfy/model_base.py index 7c788d085..e14b552c5 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1536,3 +1536,94 @@ class HunyuanImage21Refiner(HunyuanImage21): out = super().extra_conds(**kwargs) out['disable_time_r'] = comfy.conds.CONDConstant(True) return out + +class HunyuanVideo15(HunyuanVideo): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device) + + def concat_cond(self, **kwargs): + noise = kwargs.get("noise", None) + extra_channels = self.diffusion_model.img_in.proj.weight.shape[1] - noise.shape[1] - 1 #noise 32 img cond 32 + mask 1 + if extra_channels == 0: + return None + + image = kwargs.get("concat_latent_image", None) + device = kwargs["device"] + + if image is None: + shape_image = list(noise.shape) + shape_image[1] = extra_channels + image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device) + else: + latent_dim = self.latent_format.latent_channels + image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") + for i in range(0, image.shape[1], latent_dim): + image[:, i: i + latent_dim] = self.process_latent_in(image[:, i: i + latent_dim]) + image = utils.resize_to_batch_size(image, noise.shape[0]) + + mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None)) + if mask is None: + mask = torch.zeros_like(noise)[:, :1] + else: + mask = 1.0 - mask + mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") + if mask.shape[-3] < noise.shape[-3]: + mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0) + mask = utils.resize_to_batch_size(mask, noise.shape[0]) + + return torch.cat((image, mask), dim=1) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + if torch.numel(attention_mask) != attention_mask.sum(): + out['attention_mask'] = comfy.conds.CONDRegular(attention_mask) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + + conditioning_byt5small = kwargs.get("conditioning_byt5small", None) + if conditioning_byt5small is not None: + out['txt_byt5'] = comfy.conds.CONDRegular(conditioning_byt5small) + + guidance = kwargs.get("guidance", 6.0) + if guidance is not None: + out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance])) + + clip_vision_output = kwargs.get("clip_vision_output", None) + if clip_vision_output is not None: + out['clip_fea'] = comfy.conds.CONDRegular(clip_vision_output.last_hidden_state) + + return out + +class HunyuanVideo15_SR_Distilled(HunyuanVideo15): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device) + + def concat_cond(self, **kwargs): + noise = kwargs.get("noise", None) + image = kwargs.get("concat_latent_image", None) + noise_augmentation = kwargs.get("noise_augmentation", 0.0) + device = kwargs["device"] + + if image is None: + image = torch.zeros([noise.shape[0], noise.shape[1] * 2 + 2, noise.shape[-3], noise.shape[-2], noise.shape[-1]], device=comfy.model_management.intermediate_device()) + else: + image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") + #image = self.process_latent_in(image) # scaling wasn't applied in reference code + image = utils.resize_to_batch_size(image, noise.shape[0]) + lq_image_slice = slice(noise.shape[1] + 1, 2 * noise.shape[1] + 1) + if noise_augmentation > 0: + generator = torch.Generator(device="cpu") + generator.manual_seed(kwargs.get("seed", 0) - 10) + noise = torch.randn(image[:, lq_image_slice].shape, generator=generator, dtype=image.dtype, device="cpu").to(image.device) + image[:, lq_image_slice] = noise_augmentation * noise + min(1.0 - noise_augmentation, 0.75) * image[:, lq_image_slice] + else: + image[:, lq_image_slice] = 0.75 * image[:, lq_image_slice] + return image + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + out['disable_time_r'] = comfy.conds.CONDConstant(False) + return out diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 3142a7fc3..0131ca25a 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -186,6 +186,16 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): guidance_keys = list(filter(lambda a: a.startswith("{}guidance_in.".format(key_prefix)), state_dict_keys)) dit_config["guidance_embed"] = len(guidance_keys) > 0 + + # HunyuanVideo 1.5 + if '{}cond_type_embedding.weight'.format(key_prefix) in state_dict_keys: + dit_config["use_cond_type_embedding"] = True + else: + dit_config["use_cond_type_embedding"] = False + if '{}vision_in.proj.0.weight'.format(key_prefix) in state_dict_keys: + dit_config["vision_in_dim"] = state_dict['{}vision_in.proj.0.weight'.format(key_prefix)].shape[0] + else: + dit_config["vision_in_dim"] = None return dit_config if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): #Flux, Chroma or Chroma Radiance (has no img_in.weight) diff --git a/comfy/sd.py b/comfy/sd.py index 9e5ebbf15..dc0905ada 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -441,20 +441,20 @@ class VAE: elif "decoder.conv_in.conv.weight" in sd and sd['decoder.conv_in.conv.weight'].shape[1] == 32: ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True} ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1] - self.latent_channels = 64 + self.latent_channels = 32 self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16) self.upscale_index_formula = (4, 16, 16) self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16) self.downscale_index_formula = (4, 16, 16) self.latent_dim = 3 - self.not_video = True + self.not_video = False self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.EmptyRegularizer"}, encoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Encoder", 'params': ddconfig}, decoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Decoder", 'params': ddconfig}) - self.memory_used_encode = lambda shape, dtype: (1400 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype) - self.memory_used_decode = lambda shape, dtype: (1400 * shape[-3] * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype) + self.memory_used_encode = lambda shape, dtype: (1400 * 9 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (2800 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype) elif "decoder.conv_in.conv.weight" in sd: ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0} ddconfig["conv3d"] = True @@ -911,6 +911,7 @@ class CLIPType(Enum): OMNIGEN2 = 17 QWEN_IMAGE = 18 HUNYUAN_IMAGE = 19 + HUNYUAN_VIDEO_15 = 20 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}): @@ -1126,6 +1127,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip elif clip_type == CLIPType.HUNYUAN_IMAGE: clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer + elif clip_type == CLIPType.HUNYUAN_VIDEO_15: + clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer else: clip_target.clip = sdxl_clip.SDXLClipModel clip_target.tokenizer = sdxl_clip.SDXLTokenizer diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 4064bdae1..2e64b85e8 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1374,6 +1374,54 @@ class HunyuanImage21Refiner(HunyuanVideo): out = model_base.HunyuanImage21Refiner(self, device=device) return out -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] +class HunyuanVideo15(HunyuanVideo): + unet_config = { + "image_model": "hunyuan_video", + "vision_in_dim": 1152, + } + + sampling_settings = { + "shift": 7.0, + } + memory_usage_factor = 4.0 #TODO + supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32] + + latent_format = latent_formats.HunyuanVideo15 + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.HunyuanVideo15(self, device=device) + return out + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect)) + + +class HunyuanVideo15_SR_Distilled(HunyuanVideo): + unet_config = { + "image_model": "hunyuan_video", + "vision_in_dim": 1152, + "in_channels": 98, + } + + sampling_settings = { + "shift": 2.0, + } + memory_usage_factor = 4.0 #TODO + supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32] + + latent_format = latent_formats.HunyuanVideo15 + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.HunyuanVideo15_SR_Distilled(self, device=device) + return out + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect)) + +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] models += [SVD_img2vid] diff --git a/comfy/text_encoders/hunyuan_video.py b/comfy/text_encoders/hunyuan_video.py index b02148b33..557094f49 100644 --- a/comfy/text_encoders/hunyuan_video.py +++ b/comfy/text_encoders/hunyuan_video.py @@ -1,6 +1,7 @@ from comfy import sd1_clip import comfy.model_management import comfy.text_encoders.llama +from .hunyuan_image import HunyuanImageTokenizer from transformers import LlamaTokenizerFast import torch import os @@ -73,6 +74,14 @@ class HunyuanVideoTokenizer: return {} +class HunyuanVideo15Tokenizer(HunyuanImageTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) + self.llama_template = "<|im_start|>system\nYou are a helpful assistant. Describe the video by detailing the following aspects:\n1. The main content and theme of the video.\n2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.\n3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.\n4. background environment, light, style and atmosphere.\n5. camera angles, movements, and transitions used in the video.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + + def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs): + return super().tokenize_with_weights(text, return_word_ids, prevent_empty_text=True, **kwargs) + class HunyuanVideoClipModel(torch.nn.Module): def __init__(self, dtype_llama=None, device="cpu", dtype=None, model_options={}): super().__init__() diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py index 40fa67937..c0d32a6ef 100644 --- a/comfy/text_encoders/qwen_image.py +++ b/comfy/text_encoders/qwen_image.py @@ -17,12 +17,14 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer): self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n" - def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs): + def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, **kwargs): skip_template = False if text.startswith('<|im_start|>'): skip_template = True if text.startswith('<|start_header_id|>'): skip_template = True + if prevent_empty_text and text == '': + text = ' ' if skip_template: llama_text = text diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 863254ce7..79c0722a9 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -629,6 +629,10 @@ class UpscaleModel(ComfyTypeIO): if TYPE_CHECKING: Type = ImageModelDescriptor +@comfytype(io_type="LATENT_UPSCALE_MODEL") +class LatentUpscaleModel(ComfyTypeIO): + Type = Any + @comfytype(io_type="AUDIO") class Audio(ComfyTypeIO): class AudioDict(TypedDict): diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py index f7c34d059..5a2e8cc61 100644 --- a/comfy_extras/nodes_hunyuan.py +++ b/comfy_extras/nodes_hunyuan.py @@ -4,7 +4,8 @@ import torch import comfy.model_management from typing_extensions import override from comfy_api.latest import ComfyExtension, io - +from comfy.ldm.hunyuan_video.upsampler import HunyuanVideo15SRModel +import folder_paths class CLIPTextEncodeHunyuanDiT(io.ComfyNode): @classmethod @@ -57,6 +58,199 @@ class EmptyHunyuanLatentVideo(io.ComfyNode): generate = execute # TODO: remove +class EmptyHunyuanVideo15Latent(EmptyHunyuanLatentVideo): + @classmethod + def define_schema(cls): + schema = super().define_schema() + schema.node_id = "EmptyHunyuanVideo15Latent" + return schema + + @classmethod + def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput: + # Using scale factor of 16 instead of 8 + latent = torch.zeros([batch_size, 32, ((length - 1) // 4) + 1, height // 16, width // 16], device=comfy.model_management.intermediate_device()) + return io.NodeOutput({"samples": latent}) + + generate = execute # TODO: remove + + +class HunyuanVideo15ImageToVideo(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="HunyuanVideo15ImageToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=33, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Image.Input("start_image", optional=True), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) + + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None) -> io.NodeOutput: + latent = torch.zeros([batch_size, 32, ((length - 1) // 4) + 1, height // 16, width // 16], device=comfy.model_management.intermediate_device()) + + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + + encoded = vae.encode(start_image[:, :, :, :3]) + concat_latent_image = torch.zeros((latent.shape[0], 32, latent.shape[2], latent.shape[3], latent.shape[4]), device=comfy.model_management.intermediate_device()) + concat_latent_image[:, :, :encoded.shape[2], :, :] = encoded + + mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) + mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0 + + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) + + if clip_vision_output is not None: + positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output}) + negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output}) + + out_latent = {} + out_latent["samples"] = latent + return io.NodeOutput(positive, negative, out_latent) + + +class HunyuanVideo15SuperResolution(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="HunyuanVideo15SuperResolution", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae", optional=True), + io.Image.Input("start_image", optional=True), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + io.Latent.Input("latent"), + io.Float.Input("noise_augmentation", default=0.70, min=0.0, max=1.0, step=0.01), + + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) + + @classmethod + def execute(cls, positive, negative, latent, noise_augmentation, vae=None, start_image=None, clip_vision_output=None) -> io.NodeOutput: + in_latent = latent["samples"] + in_channels = in_latent.shape[1] + cond_latent = torch.zeros([in_latent.shape[0], in_channels * 2 + 2, in_latent.shape[-3], in_latent.shape[-2], in_latent.shape[-1]], device=comfy.model_management.intermediate_device()) + cond_latent[:, in_channels + 1 : 2 * in_channels + 1] = in_latent + cond_latent[:, 2 * in_channels + 1] = 1 + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image.movedim(-1, 1), in_latent.shape[-1] * 16, in_latent.shape[-2] * 16, "bilinear", "center").movedim(1, -1) + encoded = vae.encode(start_image[:, :, :, :3]) + cond_latent[:, :in_channels, :encoded.shape[2], :, :] = encoded + cond_latent[:, in_channels + 1, 0] = 1 + + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": cond_latent, "noise_augmentation": noise_augmentation}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": cond_latent, "noise_augmentation": noise_augmentation}) + if clip_vision_output is not None: + positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output}) + negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output}) + + return io.NodeOutput(positive, negative, latent) + + +class LatentUpscaleModelLoader(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="LatentUpscaleModelLoader", + display_name="Load Latent Upscale Model", + category="loaders", + inputs=[ + io.Combo.Input("model_name", options=folder_paths.get_filename_list("latent_upscale_models")), + ], + outputs=[ + io.LatentUpscaleModel.Output(), + ], + ) + + @classmethod + def execute(cls, model_name) -> io.NodeOutput: + model_path = folder_paths.get_full_path_or_raise("upscale_models", model_name) + sd = comfy.utils.load_torch_file(model_path, safe_load=True) + + if "blocks.0.block.0.conv.weight" in sd: + config = { + "in_channels": sd["in_conv.conv.weight"].shape[1], + "out_channels": sd["out_conv.conv.weight"].shape[0], + "hidden_channels": sd["in_conv.conv.weight"].shape[0], + "num_blocks": len([k for k in sd.keys() if k.startswith("blocks.") and k.endswith(".block.0.conv.weight")]), + "global_residual": False, + } + model_type = "720p" + elif "up.0.block.0.conv1.conv.weight" in sd: + sd = {key.replace("nin_shortcut", "nin_shortcut.conv", 1): value for key, value in sd.items()} + config = { + "z_channels": sd["conv_in.conv.weight"].shape[1], + "out_channels": sd["conv_out.conv.weight"].shape[0], + "block_out_channels": tuple(sd[f"up.{i}.block.0.conv1.conv.weight"].shape[0] for i in range(len([k for k in sd.keys() if k.startswith("up.") and k.endswith(".block.0.conv1.conv.weight")]))), + } + model_type = "1080p" + + model = HunyuanVideo15SRModel(model_type, config) + model.load_sd(sd) + + return io.NodeOutput(model) + + +class HunyuanVideo15LatentUpscaleWithModel(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="HunyuanVideo15LatentUpscaleWithModel", + display_name="Hunyuan Video 15 Latent Upscale With Model", + category="latent", + inputs=[ + io.LatentUpscaleModel.Input("model"), + io.Latent.Input("samples"), + io.Combo.Input("upscale_method", options=["nearest-exact", "bilinear", "area", "bicubic", "bislerp"], default="bilinear"), + io.Int.Input("width", default=1280, min=0, max=16384, step=8), + io.Int.Input("height", default=720, min=0, max=16384, step=8), + io.Combo.Input("crop", options=["disabled", "center"]), + ], + outputs=[ + io.Latent.Output(), + ], + ) + + @classmethod + def execute(cls, model, samples, upscale_method, width, height, crop) -> io.NodeOutput: + if width == 0 and height == 0: + return io.NodeOutput(samples) + else: + if width == 0: + height = max(64, height) + width = max(64, round(samples["samples"].shape[-1] * height / samples["samples"].shape[-2])) + elif height == 0: + width = max(64, width) + height = max(64, round(samples["samples"].shape[-2] * width / samples["samples"].shape[-1])) + else: + width = max(64, width) + height = max(64, height) + s = comfy.utils.common_upscale(samples["samples"], width // 16, height // 16, upscale_method, crop) + s = model.resample_latent(s) + return io.NodeOutput({"samples": s.cpu().float()}) + + PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = ( "<|start_header_id|>system<|end_header_id|>\n\n\nDescribe the video by detailing the following aspects according to the reference image: " "1. The main content and theme of the video." @@ -210,6 +404,11 @@ class HunyuanExtension(ComfyExtension): CLIPTextEncodeHunyuanDiT, TextEncodeHunyuanVideo_ImageToVideo, EmptyHunyuanLatentVideo, + EmptyHunyuanVideo15Latent, + HunyuanVideo15ImageToVideo, + HunyuanVideo15SuperResolution, + HunyuanVideo15LatentUpscaleWithModel, + LatentUpscaleModelLoader, HunyuanImageToVideo, EmptyHunyuanImageLatent, HunyuanRefinerLatent, diff --git a/folder_paths.py b/folder_paths.py index f110d832b..ffdc4d020 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -38,6 +38,8 @@ folder_names_and_paths["gligen"] = ([os.path.join(models_dir, "gligen")], suppor folder_names_and_paths["upscale_models"] = ([os.path.join(models_dir, "upscale_models")], supported_pt_extensions) +folder_names_and_paths["latent_upscale_models"] = ([os.path.join(models_dir, "latent_upscale_models")], supported_pt_extensions) + folder_names_and_paths["custom_nodes"] = ([os.path.join(base_path, "custom_nodes")], set()) folder_names_and_paths["hypernetworks"] = ([os.path.join(models_dir, "hypernetworks")], supported_pt_extensions) diff --git a/models/latent_upscale_models/put_latent_upscale_models_here b/models/latent_upscale_models/put_latent_upscale_models_here new file mode 100644 index 000000000..e69de29bb diff --git a/nodes.py b/nodes.py index 030371633..f023ae3b6 100644 --- a/nodes.py +++ b/nodes.py @@ -957,7 +957,7 @@ class DualCLIPLoader: def INPUT_TYPES(s): return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ), "clip_name2": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image"], ), + "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), From 33981237527a3d84d4e9c3b113f75d6dd37af6a4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:39:37 -0800 Subject: [PATCH 596/759] Fix wrong path. (#10821) --- comfy_extras/nodes_hunyuan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py index 5a2e8cc61..aa36a471f 100644 --- a/comfy_extras/nodes_hunyuan.py +++ b/comfy_extras/nodes_hunyuan.py @@ -185,7 +185,7 @@ class LatentUpscaleModelLoader(io.ComfyNode): @classmethod def execute(cls, model_name) -> io.NodeOutput: - model_path = folder_paths.get_full_path_or_raise("upscale_models", model_name) + model_path = folder_paths.get_full_path_or_raise("latent_upscale_models", model_name) sd = comfy.utils.load_torch_file(model_path, safe_load=True) if "blocks.0.block.0.conv.weight" in sd: From c55fd7481626d8bee8044ea7512ea996d13a1b90 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Fri, 21 Nov 2025 00:49:13 -0500 Subject: [PATCH 597/759] ComfyUI 0.3.71 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 9b77aabe9..b4655d553 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.70" +__version__ = "0.3.71" diff --git a/pyproject.toml b/pyproject.toml index 289b7145b..280dbaf53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.70" +version = "0.3.71" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From ecb683b057a19f1a05d18d6d0b0ee9a6c6c8f4a0 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Fri, 21 Nov 2025 13:34:47 -0800 Subject: [PATCH 598/759] update frontend to 1.30 (#10793) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 624aa7362..f83d561c9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.28.9 +comfyui-frontend-package==1.30.6 comfyui-workflow-templates==0.6.0 comfyui-embedded-docs==0.3.1 torch From 532938b16b544e4492ba0ffbe18b201b1a7bc55f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 21 Nov 2025 14:51:55 -0800 Subject: [PATCH 599/759] --disable-api-nodes now sets CSP header to force frontend offline. (#10829) --- comfy/cli_args.py | 2 +- server.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 2f30b72d2..d2b60e347 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -160,7 +160,7 @@ parser.add_argument("--windows-standalone-build", action="store_true", help="Win parser.add_argument("--disable-metadata", action="store_true", help="Disable saving prompt metadata in files.") parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Disable loading all custom nodes.") parser.add_argument("--whitelist-custom-nodes", type=str, nargs='+', default=[], help="Specify custom node folders to load even when --disable-all-custom-nodes is enabled.") -parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes.") +parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes. Also prevents the frontend from communicating with the internet.") parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.") diff --git a/server.py b/server.py index d9d5c491f..0fd2e49e3 100644 --- a/server.py +++ b/server.py @@ -164,6 +164,22 @@ def create_origin_only_middleware(): return origin_only_middleware + +def create_block_external_middleware(): + @web.middleware + async def block_external_middleware(request: web.Request, handler): + if request.method == "OPTIONS": + # Pre-flight request. Reply successfully: + response = web.Response() + else: + response = await handler(request) + + response.headers['Content-Security-Policy'] = "default-src 'self'; script-src 'self' 'unsafe-inline' blob:; style-src 'self' 'unsafe-inline'; img-src 'self' data: blob:; font-src 'self'; connect-src 'self'; frame-src 'self'; object-src 'self';" + return response + + return block_external_middleware + + class PromptServer(): def __init__(self, loop): PromptServer.instance = self @@ -193,6 +209,9 @@ class PromptServer(): else: middlewares.append(create_origin_only_middleware()) + if args.disable_api_nodes: + middlewares.append(create_block_external_middleware()) + max_upload_size = round(args.max_upload_size * 1024 * 1024) self.app = web.Application(client_max_size=max_upload_size, middlewares=middlewares) self.sockets = dict() From a9c35256bccd4018fbe74bf1e857cc18bd1900ed Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Sat, 22 Nov 2025 02:28:29 -0800 Subject: [PATCH 600/759] Update requirements.txt (#10834) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f83d561c9..8e308cd6c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.30.6 -comfyui-workflow-templates==0.6.0 +comfyui-workflow-templates==0.7.9 comfyui-embedded-docs==0.3.1 torch torchsde From d89c29f25992713ec3102017c189858a457f1215 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 22 Nov 2025 19:51:53 -0800 Subject: [PATCH 601/759] Add display names to Hunyuan latent video nodes. (#10837) --- comfy_extras/nodes_hunyuan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py index aa36a471f..32be182f1 100644 --- a/comfy_extras/nodes_hunyuan.py +++ b/comfy_extras/nodes_hunyuan.py @@ -38,6 +38,7 @@ class EmptyHunyuanLatentVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="EmptyHunyuanLatentVideo", + display_name="Empty HunyuanVideo 1.0 Latent", category="latent/video", inputs=[ io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -63,6 +64,7 @@ class EmptyHunyuanVideo15Latent(EmptyHunyuanLatentVideo): def define_schema(cls): schema = super().define_schema() schema.node_id = "EmptyHunyuanVideo15Latent" + schema.display_name = "Empty HunyuanVideo 1.5 Latent" return schema @classmethod @@ -71,8 +73,6 @@ class EmptyHunyuanVideo15Latent(EmptyHunyuanLatentVideo): latent = torch.zeros([batch_size, 32, ((length - 1) // 4) + 1, height // 16, width // 16], device=comfy.model_management.intermediate_device()) return io.NodeOutput({"samples": latent}) - generate = execute # TODO: remove - class HunyuanVideo15ImageToVideo(io.ComfyNode): @classmethod From cbd68e3d587a1b345bdc6ebcd8a8c6ba1a9d3af3 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 23 Nov 2025 01:55:22 -0800 Subject: [PATCH 602/759] Add better error message for common error. (#10846) --- comfy/cldm/cldm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comfy/cldm/cldm.py b/comfy/cldm/cldm.py index ec01665e2..c93c2e909 100644 --- a/comfy/cldm/cldm.py +++ b/comfy/cldm/cldm.py @@ -413,7 +413,8 @@ class ControlNet(nn.Module): out_middle = [] if self.num_classes is not None: - assert y.shape[0] == x.shape[0] + if y is None: + raise ValueError("y is None, did you try using a controlnet for SDXL on SD1?") emb = emb + self.label_emb(y) h = x From f66183a54142be693ab858e9f1f06ed62439a92e Mon Sep 17 00:00:00 2001 From: guill Date: Sun, 23 Nov 2025 22:56:20 -0800 Subject: [PATCH 603/759] [fix] Fixes non-async public API access (#10857) It looks like the synchronous version of the public API broke due to an addition of `from __future__ import annotations`. This change updates the async-to-sync adapter to work with both types of type annotations. --- comfy_api/internal/async_to_sync.py | 47 ++++++--- tests/execution/test_public_api.py | 153 ++++++++++++++++++++++++++++ 2 files changed, 184 insertions(+), 16 deletions(-) create mode 100644 tests/execution/test_public_api.py diff --git a/comfy_api/internal/async_to_sync.py b/comfy_api/internal/async_to_sync.py index f5f805a62..257ade82e 100644 --- a/comfy_api/internal/async_to_sync.py +++ b/comfy_api/internal/async_to_sync.py @@ -8,7 +8,7 @@ import os import textwrap import threading from enum import Enum -from typing import Optional, Type, get_origin, get_args +from typing import Optional, Type, get_origin, get_args, get_type_hints class TypeTracker: @@ -220,11 +220,18 @@ class AsyncToSyncConverter: self._async_instance = async_class(*args, **kwargs) # Handle annotated class attributes (like execution: Execution) - # Get all annotations from the class hierarchy - all_annotations = {} - for base_class in reversed(inspect.getmro(async_class)): - if hasattr(base_class, "__annotations__"): - all_annotations.update(base_class.__annotations__) + # Get all annotations from the class hierarchy and resolve string annotations + try: + # get_type_hints resolves string annotations to actual type objects + # This handles classes using 'from __future__ import annotations' + all_annotations = get_type_hints(async_class) + except Exception: + # Fallback to raw annotations if get_type_hints fails + # (e.g., for undefined forward references) + all_annotations = {} + for base_class in reversed(inspect.getmro(async_class)): + if hasattr(base_class, "__annotations__"): + all_annotations.update(base_class.__annotations__) # For each annotated attribute, check if it needs to be created or wrapped for attr_name, attr_type in all_annotations.items(): @@ -625,15 +632,19 @@ class AsyncToSyncConverter: """Extract class attributes that are classes themselves.""" class_attributes = [] + # Get resolved type hints to handle string annotations + try: + type_hints = get_type_hints(async_class) + except Exception: + type_hints = {} + # Look for class attributes that are classes for name, attr in sorted(inspect.getmembers(async_class)): if isinstance(attr, type) and not name.startswith("_"): class_attributes.append((name, attr)) - elif ( - hasattr(async_class, "__annotations__") - and name in async_class.__annotations__ - ): - annotation = async_class.__annotations__[name] + elif name in type_hints: + # Use resolved type hint instead of raw annotation + annotation = type_hints[name] if isinstance(annotation, type): class_attributes.append((name, annotation)) @@ -908,11 +919,15 @@ class AsyncToSyncConverter: attribute_mappings = {} # First check annotations for typed attributes (including from parent classes) - # Collect all annotations from the class hierarchy - all_annotations = {} - for base_class in reversed(inspect.getmro(async_class)): - if hasattr(base_class, "__annotations__"): - all_annotations.update(base_class.__annotations__) + # Resolve string annotations to actual types + try: + all_annotations = get_type_hints(async_class) + except Exception: + # Fallback to raw annotations + all_annotations = {} + for base_class in reversed(inspect.getmro(async_class)): + if hasattr(base_class, "__annotations__"): + all_annotations.update(base_class.__annotations__) for attr_name, attr_type in sorted(all_annotations.items()): for class_name, class_type in class_attributes: diff --git a/tests/execution/test_public_api.py b/tests/execution/test_public_api.py new file mode 100644 index 000000000..52bc2fcd8 --- /dev/null +++ b/tests/execution/test_public_api.py @@ -0,0 +1,153 @@ +""" +Tests for public ComfyAPI and ComfyAPISync functions. + +These tests verify that the public API methods work correctly in both sync and async contexts, +ensuring that the sync wrapper generation (via get_type_hints() in async_to_sync.py) correctly +handles string annotations from 'from __future__ import annotations'. +""" + +import pytest +import time +import subprocess +import torch +from pytest import fixture +from comfy_execution.graph_utils import GraphBuilder +from tests.execution.test_execution import ComfyClient + + +@pytest.mark.execution +class TestPublicAPI: + """Test suite for public ComfyAPI and ComfyAPISync methods.""" + + @fixture(scope="class", autouse=True) + def _server(self, args_pytest): + """Start ComfyUI server for testing.""" + pargs = [ + 'python', 'main.py', + '--output-directory', args_pytest["output_dir"], + '--listen', args_pytest["listen"], + '--port', str(args_pytest["port"]), + '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml', + '--cpu', + ] + p = subprocess.Popen(pargs) + yield + p.kill() + torch.cuda.empty_cache() + + @fixture(scope="class", autouse=True) + def shared_client(self, args_pytest, _server): + """Create shared client with connection retry.""" + client = ComfyClient() + n_tries = 5 + for i in range(n_tries): + time.sleep(4) + try: + client.connect(listen=args_pytest["listen"], port=args_pytest["port"]) + break + except ConnectionRefusedError: + if i == n_tries - 1: + raise + yield client + del client + torch.cuda.empty_cache() + + @fixture + def client(self, shared_client, request): + """Set test name for each test.""" + shared_client.set_test_name(f"public_api[{request.node.name}]") + yield shared_client + + @fixture + def builder(self, request): + """Create GraphBuilder for each test.""" + yield GraphBuilder(prefix=request.node.name) + + def test_sync_progress_update_executes(self, client: ComfyClient, builder: GraphBuilder): + """Test that TestSyncProgressUpdate executes without errors. + + This test validates that api_sync.execution.set_progress() works correctly, + which is the primary code path fixed by adding get_type_hints() to async_to_sync.py. + """ + g = builder + image = g.node("StubImage", content="BLACK", height=256, width=256, batch_size=1) + + # Use TestSyncProgressUpdate with short sleep + progress_node = g.node("TestSyncProgressUpdate", + value=image.out(0), + sleep_seconds=0.5) + output = g.node("SaveImage", images=progress_node.out(0)) + + # Execute workflow + result = client.run(g) + + # Verify execution + assert result.did_run(progress_node), "Progress node should have executed" + assert result.did_run(output), "Output node should have executed" + + # Verify output + images = result.get_images(output) + assert len(images) == 1, "Should have produced 1 image" + + def test_async_progress_update_executes(self, client: ComfyClient, builder: GraphBuilder): + """Test that TestAsyncProgressUpdate executes without errors. + + This test validates that await api.execution.set_progress() works correctly + in async contexts. + """ + g = builder + image = g.node("StubImage", content="WHITE", height=256, width=256, batch_size=1) + + # Use TestAsyncProgressUpdate with short sleep + progress_node = g.node("TestAsyncProgressUpdate", + value=image.out(0), + sleep_seconds=0.5) + output = g.node("SaveImage", images=progress_node.out(0)) + + # Execute workflow + result = client.run(g) + + # Verify execution + assert result.did_run(progress_node), "Async progress node should have executed" + assert result.did_run(output), "Output node should have executed" + + # Verify output + images = result.get_images(output) + assert len(images) == 1, "Should have produced 1 image" + + def test_sync_and_async_progress_together(self, client: ComfyClient, builder: GraphBuilder): + """Test both sync and async progress updates in same workflow. + + This test ensures that both ComfyAPISync and ComfyAPI can coexist and work + correctly in the same workflow execution. + """ + g = builder + image1 = g.node("StubImage", content="BLACK", height=256, width=256, batch_size=1) + image2 = g.node("StubImage", content="WHITE", height=256, width=256, batch_size=1) + + # Use both types of progress nodes + sync_progress = g.node("TestSyncProgressUpdate", + value=image1.out(0), + sleep_seconds=0.3) + async_progress = g.node("TestAsyncProgressUpdate", + value=image2.out(0), + sleep_seconds=0.3) + + # Create outputs + output1 = g.node("SaveImage", images=sync_progress.out(0)) + output2 = g.node("SaveImage", images=async_progress.out(0)) + + # Execute workflow + result = client.run(g) + + # Both should execute successfully + assert result.did_run(sync_progress), "Sync progress node should have executed" + assert result.did_run(async_progress), "Async progress node should have executed" + assert result.did_run(output1), "First output node should have executed" + assert result.did_run(output2), "Second output node should have executed" + + # Verify outputs + images1 = result.get_images(output1) + images2 = result.get_images(output2) + assert len(images1) == 1, "Should have produced 1 image from sync node" + assert len(images2) == 1, "Should have produced 1 image from async node" From 3bd71554a2df14b862cc5e1e875df37ba24af1ac Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 24 Nov 2025 19:48:37 +0200 Subject: [PATCH 604/759] fix(api-nodes): edge cases in responses for Gemini models (#10860) --- comfy_api_nodes/apis/gemini_api.py | 6 +++--- comfy_api_nodes/nodes_gemini.py | 21 +++++++++++---------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/comfy_api_nodes/apis/gemini_api.py b/comfy_api_nodes/apis/gemini_api.py index 710f173f1..d34590d28 100644 --- a/comfy_api_nodes/apis/gemini_api.py +++ b/comfy_api_nodes/apis/gemini_api.py @@ -113,9 +113,9 @@ class GeminiGenerationConfig(BaseModel): maxOutputTokens: int | None = Field(None, ge=16, le=8192) seed: int | None = Field(None) stopSequences: list[str] | None = Field(None) - temperature: float | None = Field(1, ge=0.0, le=2.0) - topK: int | None = Field(40, ge=1) - topP: float | None = Field(0.95, ge=0.0, le=1.0) + temperature: float | None = Field(None, ge=0.0, le=2.0) + topK: int | None = Field(None, ge=1) + topP: float | None = Field(None, ge=0.0, le=1.0) class GeminiImageConfig(BaseModel): diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index be752c885..938a20f84 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -104,14 +104,14 @@ def get_parts_by_type(response: GeminiGenerateContentResponse, part_type: Litera List of response parts matching the requested type. """ if response.candidates is None: - if response.promptFeedback.blockReason: + if response.promptFeedback and response.promptFeedback.blockReason: feedback = response.promptFeedback raise ValueError( f"Gemini API blocked the request. Reason: {feedback.blockReason} ({feedback.blockReasonMessage})" ) - raise NotImplementedError( - "Gemini returned no response candidates. " - "Please report to ComfyUI repository with the example of workflow to reproduce this." + raise ValueError( + "Gemini API returned no response candidates. If you are using the `IMAGE` modality, " + "try changing it to `IMAGE+TEXT` to view the model's reasoning and understand why image generation failed." ) parts = [] for part in response.candidates[0].content.parts: @@ -182,11 +182,12 @@ def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | N else: return None final_price = response.usageMetadata.promptTokenCount * input_tokens_price - for i in response.usageMetadata.candidatesTokensDetails: - if i.modality == Modality.IMAGE: - final_price += output_image_tokens_price * i.tokenCount # for Nano Banana models - else: - final_price += output_text_tokens_price * i.tokenCount + if response.usageMetadata.candidatesTokensDetails: + for i in response.usageMetadata.candidatesTokensDetails: + if i.modality == Modality.IMAGE: + final_price += output_image_tokens_price * i.tokenCount # for Nano Banana models + else: + final_price += output_text_tokens_price * i.tokenCount if response.usageMetadata.thoughtsTokenCount: final_price += output_text_tokens_price * response.usageMetadata.thoughtsTokenCount return final_price / 1_000_000.0 @@ -645,7 +646,7 @@ class GeminiImage2(IO.ComfyNode): options=["auto", "1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"], default="auto", tooltip="If set to 'auto', matches your input image's aspect ratio; " - "if no image is provided, generates a 1:1 square.", + "if no image is provided, a 16:9 square is usually generated.", ), IO.Combo.Input( "resolution", From 1286fcfe40b98052e4edbe9a02f12ad89ac74924 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 24 Nov 2025 20:24:29 +0200 Subject: [PATCH 605/759] add get_frame_count and get_frame_rate methods to VideoInput class (#10851) --- comfy_api/latest/_input/video_types.py | 28 ++++++++ comfy_api/latest/_input_impl/video_types.py | 72 +++++++++++++++++++++ comfy_api_nodes/nodes_topaz.py | 15 ++--- 3 files changed, 106 insertions(+), 9 deletions(-) diff --git a/comfy_api/latest/_input/video_types.py b/comfy_api/latest/_input/video_types.py index a335df4d0..87c81d73a 100644 --- a/comfy_api/latest/_input/video_types.py +++ b/comfy_api/latest/_input/video_types.py @@ -1,5 +1,6 @@ from __future__ import annotations from abc import ABC, abstractmethod +from fractions import Fraction from typing import Optional, Union, IO import io import av @@ -72,6 +73,33 @@ class VideoInput(ABC): frame_count = components.images.shape[0] return float(frame_count / components.frame_rate) + def get_frame_count(self) -> int: + """ + Returns the number of frames in the video. + + Default implementation uses :meth:`get_components`, which may require + loading all frames into memory. File-based implementations should + override this method and use container/stream metadata instead. + + Returns: + Total number of frames as an integer. + """ + return int(self.get_components().images.shape[0]) + + def get_frame_rate(self) -> Fraction: + """ + Returns the frame rate of the video. + + Default implementation materializes the video into memory via + `get_components()`. Subclasses that can inspect the underlying + container (e.g. `VideoFromFile`) should override this with a more + efficient implementation. + + Returns: + Frame rate as a Fraction. + """ + return self.get_components().frame_rate + def get_container_format(self) -> str: """ Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py index f646504c8..bde37f90a 100644 --- a/comfy_api/latest/_input_impl/video_types.py +++ b/comfy_api/latest/_input_impl/video_types.py @@ -121,6 +121,71 @@ class VideoFromFile(VideoInput): raise ValueError(f"Could not determine duration for file '{self.__file}'") + def get_frame_count(self) -> int: + """ + Returns the number of frames in the video without materializing them as + torch tensors. + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) + + with av.open(self.__file, mode="r") as container: + video_stream = self._get_first_video_stream(container) + # 1. Prefer the frames field if available + if video_stream.frames and video_stream.frames > 0: + return int(video_stream.frames) + + # 2. Try to estimate from duration and average_rate using only metadata + if container.duration is not None and video_stream.average_rate: + duration_seconds = float(container.duration / av.time_base) + estimated_frames = int(round(duration_seconds * float(video_stream.average_rate))) + if estimated_frames > 0: + return estimated_frames + + if ( + getattr(video_stream, "duration", None) is not None + and getattr(video_stream, "time_base", None) is not None + and video_stream.average_rate + ): + duration_seconds = float(video_stream.duration * video_stream.time_base) + estimated_frames = int(round(duration_seconds * float(video_stream.average_rate))) + if estimated_frames > 0: + return estimated_frames + + # 3. Last resort: decode frames and count them (streaming) + frame_count = 0 + container.seek(0) + for packet in container.demux(video_stream): + for _ in packet.decode(): + frame_count += 1 + + if frame_count == 0: + raise ValueError(f"Could not determine frame count for file '{self.__file}'") + return frame_count + + def get_frame_rate(self) -> Fraction: + """ + Returns the average frame rate of the video using container metadata + without decoding all frames. + """ + if isinstance(self.__file, io.BytesIO): + self.__file.seek(0) + + with av.open(self.__file, mode="r") as container: + video_stream = self._get_first_video_stream(container) + # Preferred: use PyAV's average_rate (usually already a Fraction-like) + if video_stream.average_rate: + return Fraction(video_stream.average_rate) + + # Fallback: estimate from frames + duration if available + if video_stream.frames and container.duration: + duration_seconds = float(container.duration / av.time_base) + if duration_seconds > 0: + return Fraction(video_stream.frames / duration_seconds).limit_denominator() + + # Last resort: match get_components_internal default + return Fraction(1) + def get_container_format(self) -> str: """ Returns the container format of the video (e.g., 'mp4', 'mov', 'avi'). @@ -238,6 +303,13 @@ class VideoFromFile(VideoInput): packet.stream = stream_map[packet.stream] output_container.mux(packet) + def _get_first_video_stream(self, container: InputContainer): + video_stream = next((s for s in container.streams if s.type == "video"), None) + if video_stream is None: + raise ValueError(f"No video stream found in file '{self.__file}'") + return video_stream + + class VideoFromComponents(VideoInput): """ Class representing video input from tensors. diff --git a/comfy_api_nodes/nodes_topaz.py b/comfy_api_nodes/nodes_topaz.py index 79c7bf43d..f522756e5 100644 --- a/comfy_api_nodes/nodes_topaz.py +++ b/comfy_api_nodes/nodes_topaz.py @@ -5,8 +5,7 @@ import aiohttp import torch from typing_extensions import override -from comfy_api.input.video_types import VideoInput -from comfy_api.latest import IO, ComfyExtension +from comfy_api.latest import IO, ComfyExtension, Input from comfy_api_nodes.apis import topaz_api from comfy_api_nodes.util import ( ApiEndpoint, @@ -282,7 +281,7 @@ class TopazVideoEnhance(IO.ComfyNode): @classmethod async def execute( cls, - video: VideoInput, + video: Input.Video, upscaler_enabled: bool, upscaler_model: str, upscaler_resolution: str, @@ -297,12 +296,10 @@ class TopazVideoEnhance(IO.ComfyNode): ) -> IO.NodeOutput: if upscaler_enabled is False and interpolation_enabled is False: raise ValueError("There is nothing to do: both upscaling and interpolation are disabled.") - src_width, src_height = video.get_dimensions() - video_components = video.get_components() - src_frame_rate = int(video_components.frame_rate) - duration_sec = video.get_duration() - estimated_frames = int(duration_sec * src_frame_rate) validate_container_format_is_mp4(video) + src_width, src_height = video.get_dimensions() + src_frame_rate = int(video.get_frame_rate()) + duration_sec = video.get_duration() src_video_stream = video.get_stream_source() target_width = src_width target_height = src_height @@ -338,7 +335,7 @@ class TopazVideoEnhance(IO.ComfyNode): container="mp4", size=get_fs_object_size(src_video_stream), duration=int(duration_sec), - frameCount=estimated_frames, + frameCount=video.get_frame_count(), frameRate=src_frame_rate, resolution=topaz_api.Resolution(width=src_width, height=src_height), ), From 3d1fdaf9f448b34e4eba68bfd8e8de373ec0d22d Mon Sep 17 00:00:00 2001 From: Haoming <73768377+Haoming02@users.noreply.github.com> Date: Tue, 25 Nov 2025 02:30:40 +0800 Subject: [PATCH 606/759] block info (#10843) --- comfy/ldm/chroma/model.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py index 67bf70eb1..a72f8cc47 100644 --- a/comfy/ldm/chroma/model.py +++ b/comfy/ldm/chroma/model.py @@ -179,7 +179,10 @@ class Chroma(nn.Module): pe = self.pe_embedder(ids) blocks_replace = patches_replace.get("dit", {}) + transformer_options["total_blocks"] = len(self.double_blocks) + transformer_options["block_type"] = "double" for i, block in enumerate(self.double_blocks): + transformer_options["block_index"] = i if i not in self.skip_mmdit: double_mod = ( self.get_modulations(mod_vectors, "double_img", idx=i), @@ -222,7 +225,10 @@ class Chroma(nn.Module): img = torch.cat((txt, img), 1) + transformer_options["total_blocks"] = len(self.single_blocks) + transformer_options["block_type"] = "single" for i, block in enumerate(self.single_blocks): + transformer_options["block_index"] = i if i not in self.skip_dit: single_mod = self.get_modulations(mod_vectors, "single", idx=i) if ("single_block", i) in blocks_replace: From 6a6d456c88723538e3d0e5e942f78109ece5b73d Mon Sep 17 00:00:00 2001 From: Haoming <73768377+Haoming02@users.noreply.github.com> Date: Tue, 25 Nov 2025 02:38:38 +0800 Subject: [PATCH 607/759] block info (#10842) --- comfy/ldm/qwen_image/model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 427ea19c1..8c75670cd 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -439,7 +439,10 @@ class QwenImageTransformer2DModel(nn.Module): patches = transformer_options.get("patches", {}) blocks_replace = patches_replace.get("dit", {}) + transformer_options["total_blocks"] = len(self.transformer_blocks) + transformer_options["block_type"] = "double" for i, block in enumerate(self.transformer_blocks): + transformer_options["block_index"] = i if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} From b2ef58e2b17e73ca8cd376a1cdc976518ebbc168 Mon Sep 17 00:00:00 2001 From: Haoming <73768377+Haoming02@users.noreply.github.com> Date: Tue, 25 Nov 2025 02:40:09 +0800 Subject: [PATCH 608/759] block info (#10844) --- comfy/ldm/hunyuan_video/model.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py index f75c6e0e1..2749c53f5 100644 --- a/comfy/ldm/hunyuan_video/model.py +++ b/comfy/ldm/hunyuan_video/model.py @@ -389,7 +389,10 @@ class HunyuanVideo(nn.Module): attn_mask = None blocks_replace = patches_replace.get("dit", {}) + transformer_options["total_blocks"] = len(self.double_blocks) + transformer_options["block_type"] = "double" for i, block in enumerate(self.double_blocks): + transformer_options["block_index"] = i if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} @@ -411,7 +414,10 @@ class HunyuanVideo(nn.Module): img = torch.cat((img, txt), 1) + transformer_options["total_blocks"] = len(self.single_blocks) + transformer_options["block_type"] = "single" for i, block in enumerate(self.single_blocks): + transformer_options["block_index"] = i if ("single_block", i) in blocks_replace: def block_wrap(args): out = {} From 22a2644e57530ee40e13486ccd7c953b87072093 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 24 Nov 2025 16:45:54 -0800 Subject: [PATCH 609/759] Bump transformers version in requirements.txt (#10869) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8e308cd6c..b7014f956 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ torchvision torchaudio numpy>=1.25.0 einops -transformers>=4.37.2 +transformers>=4.50.3 tokenizers>=0.13.3 sentencepiece safetensors>=0.4.2 From 25022e0b0965975b35bcaf28b153184d60a4f9de Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 24 Nov 2025 22:48:53 -0800 Subject: [PATCH 610/759] Cleanup and fix issues with text encoder quants. (#10872) --- comfy/model_patcher.py | 3 +- comfy/ops.py | 168 +++++++++--------- comfy/quant_ops.py | 12 ++ comfy/sd.py | 9 +- comfy/sd1_clip.py | 18 +- comfy/text_encoders/hunyuan_video.py | 3 + .../comfy_quant/test_mixed_precision.py | 17 +- 7 files changed, 128 insertions(+), 102 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index cf1b0d441..6551ced5a 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -231,7 +231,6 @@ class ModelPatcher: self.object_patches_backup = {} self.weight_wrapper_patches = {} self.model_options = {"transformer_options":{}} - self.model_size() self.load_device = load_device self.offload_device = offload_device self.weight_inplace_update = weight_inplace_update @@ -286,7 +285,7 @@ class ModelPatcher: return self.model.lowvram_patch_counter def clone(self): - n = self.__class__(self.model, self.load_device, self.offload_device, self.size, weight_inplace_update=self.weight_inplace_update) + n = self.__class__(self.model, self.load_device, self.offload_device, self.model_size(), weight_inplace_update=self.weight_inplace_update) n.patches = {} for k in self.patches: n.patches[k] = self.patches[k][:] diff --git a/comfy/ops.py b/comfy/ops.py index 640622fd1..af185ec24 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -540,113 +540,115 @@ if CUBLAS_IS_AVAILABLE: # ============================================================================== from .quant_ops import QuantizedTensor, QUANT_ALGOS -class MixedPrecisionOps(disable_weight_init): - _layer_quant_config = {} - _compute_dtype = torch.bfloat16 - class Linear(torch.nn.Module, CastWeightBiasOp): - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = True, - device=None, - dtype=None, - ) -> None: - super().__init__() +def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False): + class MixedPrecisionOps(manual_cast): + _layer_quant_config = layer_quant_config + _compute_dtype = compute_dtype + _full_precision_mm = full_precision_mm - self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype} - # self.factory_kwargs = {"device": device, "dtype": dtype} + class Linear(torch.nn.Module, CastWeightBiasOp): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + super().__init__() - self.in_features = in_features - self.out_features = out_features - if bias: - self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs)) - else: - self.register_parameter("bias", None) + self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype} + # self.factory_kwargs = {"device": device, "dtype": dtype} - self.tensor_class = None + self.in_features = in_features + self.out_features = out_features + if bias: + self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs)) + else: + self.register_parameter("bias", None) - def reset_parameters(self): - return None + self.tensor_class = None + self._full_precision_mm = MixedPrecisionOps._full_precision_mm - def _load_from_state_dict(self, state_dict, prefix, local_metadata, - strict, missing_keys, unexpected_keys, error_msgs): + def reset_parameters(self): + return None - device = self.factory_kwargs["device"] - layer_name = prefix.rstrip('.') - weight_key = f"{prefix}weight" - weight = state_dict.pop(weight_key, None) - if weight is None: - raise ValueError(f"Missing weight for layer {layer_name}") + def _load_from_state_dict(self, state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, error_msgs): - manually_loaded_keys = [weight_key] + device = self.factory_kwargs["device"] + layer_name = prefix.rstrip('.') + weight_key = f"{prefix}weight" + weight = state_dict.pop(weight_key, None) + if weight is None: + raise ValueError(f"Missing weight for layer {layer_name}") - if layer_name not in MixedPrecisionOps._layer_quant_config: - self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False) - else: - quant_format = MixedPrecisionOps._layer_quant_config[layer_name].get("format", None) - if quant_format is None: - raise ValueError(f"Unknown quantization format for layer {layer_name}") + manually_loaded_keys = [weight_key] - qconfig = QUANT_ALGOS[quant_format] - self.layout_type = qconfig["comfy_tensor_layout"] + if layer_name not in MixedPrecisionOps._layer_quant_config: + self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False) + else: + quant_format = MixedPrecisionOps._layer_quant_config[layer_name].get("format", None) + if quant_format is None: + raise ValueError(f"Unknown quantization format for layer {layer_name}") - weight_scale_key = f"{prefix}weight_scale" - layout_params = { - 'scale': state_dict.pop(weight_scale_key, None), - 'orig_dtype': MixedPrecisionOps._compute_dtype, - 'block_size': qconfig.get("group_size", None), - } - if layout_params['scale'] is not None: - manually_loaded_keys.append(weight_scale_key) + qconfig = QUANT_ALGOS[quant_format] + self.layout_type = qconfig["comfy_tensor_layout"] - self.weight = torch.nn.Parameter( - QuantizedTensor(weight.to(device=device), self.layout_type, layout_params), - requires_grad=False - ) + weight_scale_key = f"{prefix}weight_scale" + layout_params = { + 'scale': state_dict.pop(weight_scale_key, None), + 'orig_dtype': MixedPrecisionOps._compute_dtype, + 'block_size': qconfig.get("group_size", None), + } + if layout_params['scale'] is not None: + manually_loaded_keys.append(weight_scale_key) - for param_name in qconfig["parameters"]: - param_key = f"{prefix}{param_name}" - _v = state_dict.pop(param_key, None) - if _v is None: - continue - setattr(self, param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False)) - manually_loaded_keys.append(param_key) + self.weight = torch.nn.Parameter( + QuantizedTensor(weight.to(device=device), self.layout_type, layout_params), + requires_grad=False + ) - super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) + for param_name in qconfig["parameters"]: + param_key = f"{prefix}{param_name}" + _v = state_dict.pop(param_key, None) + if _v is None: + continue + setattr(self, param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False)) + manually_loaded_keys.append(param_key) - for key in manually_loaded_keys: - if key in missing_keys: - missing_keys.remove(key) + super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) - def _forward(self, input, weight, bias): - return torch.nn.functional.linear(input, weight, bias) + for key in manually_loaded_keys: + if key in missing_keys: + missing_keys.remove(key) - def forward_comfy_cast_weights(self, input): - weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) - x = self._forward(input, weight, bias) - uncast_bias_weight(self, weight, bias, offload_stream) - return x + def _forward(self, input, weight, bias): + return torch.nn.functional.linear(input, weight, bias) - def forward(self, input, *args, **kwargs): - run_every_op() + def forward_comfy_cast_weights(self, input): + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = self._forward(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x - if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: - return self.forward_comfy_cast_weights(input, *args, **kwargs) - if (getattr(self, 'layout_type', None) is not None and - getattr(self, 'input_scale', None) is not None and - not isinstance(input, QuantizedTensor)): - input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, dtype=self.weight.dtype) - return self._forward(input, self.weight, self.bias) + def forward(self, input, *args, **kwargs): + run_every_op() + if self._full_precision_mm or self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: + return self.forward_comfy_cast_weights(input, *args, **kwargs) + if (getattr(self, 'layout_type', None) is not None and + getattr(self, 'input_scale', None) is not None and + not isinstance(input, QuantizedTensor)): + input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, dtype=self.weight.dtype) + return self._forward(input, self.weight, self.bias) + return MixedPrecisionOps def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None, model_config=None): if model_config and hasattr(model_config, 'layer_quant_config') and model_config.layer_quant_config: - MixedPrecisionOps._layer_quant_config = model_config.layer_quant_config - MixedPrecisionOps._compute_dtype = compute_dtype logging.info(f"Using mixed precision operations: {len(model_config.layer_quant_config)} quantized layers") - return MixedPrecisionOps + return mixed_precision_ops(model_config.layer_quant_config, compute_dtype) fp8_compute = comfy.model_management.supports_fp8_compute(load_device) if scaled_fp8 is not None: diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index 1d058bece..905b4729e 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -338,6 +338,18 @@ def generic_copy_(func, args, kwargs): return func(*args, **kwargs) +@register_generic_util(torch.ops.aten.to.dtype) +def generic_to_dtype(func, args, kwargs): + """Handle .to(dtype) calls - dtype conversion only.""" + src = args[0] + if isinstance(src, QuantizedTensor): + # For dtype-only conversion, just change the orig_dtype, no real cast is needed + target_dtype = args[1] if len(args) > 1 else kwargs.get('dtype') + src._layout_params["orig_dtype"] = target_dtype + return src + return func(*args, **kwargs) + + @register_generic_util(torch.ops.aten._has_compatible_shallow_copy_type.default) def generic_has_compatible_shallow_copy_type(func, args, kwargs): return True diff --git a/comfy/sd.py b/comfy/sd.py index dc0905ada..b6df0bd61 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -917,7 +917,12 @@ class CLIPType(Enum): def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}): clip_data = [] for p in ckpt_paths: - clip_data.append(comfy.utils.load_torch_file(p, safe_load=True)) + sd, metadata = comfy.utils.load_torch_file(p, safe_load=True, return_metadata=True) + if metadata is not None: + quant_metadata = metadata.get("_quantization_metadata", None) + if quant_metadata is not None: + sd["_quantization_metadata"] = quant_metadata + clip_data.append(sd) return load_text_encoder_state_dicts(clip_data, embedding_directory=embedding_directory, clip_type=clip_type, model_options=model_options) @@ -1142,6 +1147,8 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip parameters = 0 for c in clip_data: + if "_quantization_metadata" in c: + c.pop("_quantization_metadata") parameters += comfy.utils.calculate_parameters(c) tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 3066de2d7..8f509bab1 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -109,13 +109,23 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): operations = model_options.get("custom_operations", None) scaled_fp8 = None + quantization_metadata = model_options.get("quantization_metadata", None) if operations is None: - scaled_fp8 = model_options.get("scaled_fp8", None) - if scaled_fp8 is not None: - operations = comfy.ops.scaled_fp8_ops(fp8_matrix_mult=False, override_dtype=scaled_fp8) + layer_quant_config = None + if quantization_metadata is not None: + layer_quant_config = json.loads(quantization_metadata).get("layers", None) + + if layer_quant_config is not None: + operations = comfy.ops.mixed_precision_ops(layer_quant_config, dtype, full_precision_mm=True) + logging.info(f"Using MixedPrecisionOps for text encoder: {len(layer_quant_config)} quantized layers") else: - operations = comfy.ops.manual_cast + # Fallback to scaled_fp8_ops for backward compatibility + scaled_fp8 = model_options.get("scaled_fp8", None) + if scaled_fp8 is not None: + operations = comfy.ops.scaled_fp8_ops(fp8_matrix_mult=False, override_dtype=scaled_fp8) + else: + operations = comfy.ops.manual_cast self.operations = operations self.transformer = model_class(config, dtype, device, self.operations) diff --git a/comfy/text_encoders/hunyuan_video.py b/comfy/text_encoders/hunyuan_video.py index 557094f49..0110517bb 100644 --- a/comfy/text_encoders/hunyuan_video.py +++ b/comfy/text_encoders/hunyuan_video.py @@ -18,6 +18,9 @@ def llama_detect(state_dict, prefix=""): if scaled_fp8_key in state_dict: out["llama_scaled_fp8"] = state_dict[scaled_fp8_key].dtype + if "_quantization_metadata" in state_dict: + out["llama_quantization_metadata"] = state_dict["_quantization_metadata"] + return out diff --git a/tests-unit/comfy_quant/test_mixed_precision.py b/tests-unit/comfy_quant/test_mixed_precision.py index f8d1fd04e..63361309f 100644 --- a/tests-unit/comfy_quant/test_mixed_precision.py +++ b/tests-unit/comfy_quant/test_mixed_precision.py @@ -37,11 +37,8 @@ class TestMixedPrecisionOps(unittest.TestCase): def test_all_layers_standard(self): """Test that model with no quantization works normally""" - # Configure no quantization - ops.MixedPrecisionOps._layer_quant_config = {} - # Create model - model = SimpleModel(operations=ops.MixedPrecisionOps) + model = SimpleModel(operations=ops.mixed_precision_ops({})) # Initialize weights manually model.layer1.weight = torch.nn.Parameter(torch.randn(20, 10, dtype=torch.bfloat16)) @@ -76,7 +73,6 @@ class TestMixedPrecisionOps(unittest.TestCase): "params": {} } } - ops.MixedPrecisionOps._layer_quant_config = layer_quant_config # Create state dict with mixed precision fp8_weight1 = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn) @@ -99,7 +95,7 @@ class TestMixedPrecisionOps(unittest.TestCase): } # Create model and load state dict (strict=False because custom loading pops keys) - model = SimpleModel(operations=ops.MixedPrecisionOps) + model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config)) model.load_state_dict(state_dict, strict=False) # Verify weights are wrapped in QuantizedTensor @@ -132,7 +128,6 @@ class TestMixedPrecisionOps(unittest.TestCase): "params": {} } } - ops.MixedPrecisionOps._layer_quant_config = layer_quant_config # Create and load model fp8_weight = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn) @@ -146,7 +141,7 @@ class TestMixedPrecisionOps(unittest.TestCase): "layer3.bias": torch.randn(40, dtype=torch.bfloat16), } - model = SimpleModel(operations=ops.MixedPrecisionOps) + model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config)) model.load_state_dict(state_dict1, strict=False) # Save state dict @@ -170,7 +165,6 @@ class TestMixedPrecisionOps(unittest.TestCase): "params": {} } } - ops.MixedPrecisionOps._layer_quant_config = layer_quant_config # Create and load model fp8_weight = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn) @@ -184,7 +178,7 @@ class TestMixedPrecisionOps(unittest.TestCase): "layer3.bias": torch.randn(40, dtype=torch.bfloat16), } - model = SimpleModel(operations=ops.MixedPrecisionOps) + model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config)) model.load_state_dict(state_dict, strict=False) # Add a weight function (simulating LoRA) @@ -210,7 +204,6 @@ class TestMixedPrecisionOps(unittest.TestCase): "params": {} } } - ops.MixedPrecisionOps._layer_quant_config = layer_quant_config # Create state dict state_dict = { @@ -223,7 +216,7 @@ class TestMixedPrecisionOps(unittest.TestCase): } # Load should raise KeyError for unknown format in QUANT_FORMAT_MIXINS - model = SimpleModel(operations=ops.MixedPrecisionOps) + model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config)) with self.assertRaises(KeyError): model.load_state_dict(state_dict, strict=False) From b6805429b9c2f3aa919035bea849ecd1de3ac8e4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 24 Nov 2025 23:48:20 -0800 Subject: [PATCH 611/759] Allow pinning quantized tensors. (#10873) --- comfy/model_management.py | 6 +++++- comfy/quant_ops.py | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index a21df54b3..a9327ac80 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1098,13 +1098,14 @@ if not args.disable_pinned_memory: MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95 logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024))) +PINNING_ALLOWED_TYPES = set(["Parameter", "QuantizedTensor"]) def pin_memory(tensor): global TOTAL_PINNED_MEMORY if MAX_PINNED_MEMORY <= 0: return False - if type(tensor) is not torch.nn.parameter.Parameter: + if type(tensor).__name__ not in PINNING_ALLOWED_TYPES: return False if not is_device_cpu(tensor.device): @@ -1124,6 +1125,9 @@ def pin_memory(tensor): return False ptr = tensor.data_ptr() + if ptr == 0: + return False + if torch.cuda.cudart().cudaHostRegister(ptr, size, 1) == 0: PINNED_MEMORY[ptr] = size TOTAL_PINNED_MEMORY += size diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index 905b4729e..e938144a7 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -228,6 +228,14 @@ class QuantizedTensor(torch.Tensor): new_kwargs = dequant_arg(kwargs) return func(*new_args, **new_kwargs) + def data_ptr(self): + return self._qdata.data_ptr() + + def is_pinned(self): + return self._qdata.is_pinned() + + def is_contiguous(self): + return self._qdata.is_contiguous() # ============================================================================== # Generic Utilities (Layout-Agnostic Operations) From acfaa5c4a132e1c01bc9d94e76b0d667c899bfd1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 24 Nov 2025 23:55:49 -0800 Subject: [PATCH 612/759] Don't try fp8 matrix mult in quantized ops if not supported by hardware. (#10874) --- comfy/ops.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index af185ec24..785aa1c9f 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -646,11 +646,12 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful return MixedPrecisionOps def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None, model_config=None): + fp8_compute = comfy.model_management.supports_fp8_compute(load_device) # TODO: if we support more ops this needs to be more granular + if model_config and hasattr(model_config, 'layer_quant_config') and model_config.layer_quant_config: logging.info(f"Using mixed precision operations: {len(model_config.layer_quant_config)} quantized layers") - return mixed_precision_ops(model_config.layer_quant_config, compute_dtype) + return mixed_precision_ops(model_config.layer_quant_config, compute_dtype, full_precision_mm=not fp8_compute) - fp8_compute = comfy.model_management.supports_fp8_compute(load_device) if scaled_fp8 is not None: return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8) From 015a0599d08f1072155b9213d488b73e502fea3c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 25 Nov 2025 00:23:19 -0800 Subject: [PATCH 613/759] I found a case where this is needed (#10875) --- comfy/quant_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index e938144a7..0c16bcf8d 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -405,8 +405,8 @@ class TensorCoreFP8Layout(QuantizedLayout): tensor_scaled = tensor * (1.0 / scale).to(tensor.dtype) # TODO: uncomment this if it's actually needed because the clamp has a small performance penality' - # lp_amax = torch.finfo(dtype).max - # torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled) + lp_amax = torch.finfo(dtype).max + torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled) qdata = tensor_scaled.to(dtype, memory_format=torch.contiguous_format) layout_params = { From 6b573ae0cb11000a0330a35d9e31917c22c874a4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 25 Nov 2025 07:50:19 -0800 Subject: [PATCH 614/759] Flux 2 (#10879) --- comfy/latent_formats.py | 9 +++ comfy/ldm/flux/layers.py | 90 +++++++++++++++++++-------- comfy/ldm/flux/model.py | 80 ++++++++++++++++++------ comfy/ldm/models/autoencoder.py | 42 +++++++++++++ comfy/model_base.py | 23 +++++-- comfy/model_detection.py | 50 +++++++++++---- comfy/sd.py | 26 +++++++- comfy/supported_models.py | 34 +++++++++- comfy/text_encoders/flux.py | 107 +++++++++++++++++++++++++++++++- comfy/text_encoders/llama.py | 31 +++++++++ comfy_extras/nodes_flux.py | 80 +++++++++++++++++++++++- nodes.py | 2 +- 12 files changed, 506 insertions(+), 68 deletions(-) diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 204fc048d..e98c7d6d8 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -178,6 +178,15 @@ class Flux(SD3): def process_out(self, latent): return (latent / self.scale_factor) + self.shift_factor +class Flux2(LatentFormat): + latent_channels = 128 + + def process_in(self, latent): + return latent + + def process_out(self, latent): + return latent + class Mochi(LatentFormat): latent_channels = 12 latent_dimensions = 3 diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py index 23150a712..2472ab79c 100644 --- a/comfy/ldm/flux/layers.py +++ b/comfy/ldm/flux/layers.py @@ -48,11 +48,11 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10 return embedding class MLPEmbedder(nn.Module): - def __init__(self, in_dim: int, hidden_dim: int, dtype=None, device=None, operations=None): + def __init__(self, in_dim: int, hidden_dim: int, bias=True, dtype=None, device=None, operations=None): super().__init__() - self.in_layer = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device) + self.in_layer = operations.Linear(in_dim, hidden_dim, bias=bias, dtype=dtype, device=device) self.silu = nn.SiLU() - self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=True, dtype=dtype, device=device) + self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=bias, dtype=dtype, device=device) def forward(self, x: Tensor) -> Tensor: return self.out_layer(self.silu(self.in_layer(x))) @@ -80,14 +80,14 @@ class QKNorm(torch.nn.Module): class SelfAttention(nn.Module): - def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, dtype=None, device=None, operations=None): + def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, proj_bias: bool = True, dtype=None, device=None, operations=None): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device) self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations) - self.proj = operations.Linear(dim, dim, dtype=dtype, device=device) + self.proj = operations.Linear(dim, dim, bias=proj_bias, dtype=dtype, device=device) @dataclass @@ -98,11 +98,11 @@ class ModulationOut: class Modulation(nn.Module): - def __init__(self, dim: int, double: bool, dtype=None, device=None, operations=None): + def __init__(self, dim: int, double: bool, bias=True, dtype=None, device=None, operations=None): super().__init__() self.is_double = double self.multiplier = 6 if double else 3 - self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device) + self.lin = operations.Linear(dim, self.multiplier * dim, bias=bias, dtype=dtype, device=device) def forward(self, vec: Tensor) -> tuple: if vec.ndim == 2: @@ -129,8 +129,18 @@ def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None): return tensor +class SiLUActivation(nn.Module): + def __init__(self): + super().__init__() + self.gate_fn = nn.SiLU() + + def forward(self, x: Tensor) -> Tensor: + x1, x2 = x.chunk(2, dim=-1) + return self.gate_fn(x1) * x2 + + class DoubleStreamBlock(nn.Module): - def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, dtype=None, device=None, operations=None): + def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, dtype=None, device=None, operations=None): super().__init__() mlp_hidden_dim = int(hidden_size * mlp_ratio) @@ -142,27 +152,44 @@ class DoubleStreamBlock(nn.Module): self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations) self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations) + self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations) self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.img_mlp = nn.Sequential( - operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), - nn.GELU(approximate="tanh"), - operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), - ) + + if mlp_silu_act: + self.img_mlp = nn.Sequential( + operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device), + SiLUActivation(), + operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device), + ) + else: + self.img_mlp = nn.Sequential( + operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), + nn.GELU(approximate="tanh"), + operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), + ) if self.modulation: self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations) self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations) + self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations) self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.txt_mlp = nn.Sequential( - operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), - nn.GELU(approximate="tanh"), - operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), - ) + + if mlp_silu_act: + self.txt_mlp = nn.Sequential( + operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device), + SiLUActivation(), + operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device), + ) + else: + self.txt_mlp = nn.Sequential( + operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), + nn.GELU(approximate="tanh"), + operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), + ) + self.flipped_img_txt = flipped_img_txt def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}): @@ -246,6 +273,8 @@ class SingleStreamBlock(nn.Module): mlp_ratio: float = 4.0, qk_scale: float = None, modulation=True, + mlp_silu_act=False, + bias=True, dtype=None, device=None, operations=None @@ -257,17 +286,24 @@ class SingleStreamBlock(nn.Module): self.scale = qk_scale or head_dim**-0.5 self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + + self.mlp_hidden_dim_first = self.mlp_hidden_dim + if mlp_silu_act: + self.mlp_hidden_dim_first = int(hidden_size * mlp_ratio * 2) + self.mlp_act = SiLUActivation() + else: + self.mlp_act = nn.GELU(approximate="tanh") + # qkv and mlp_in - self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device) + self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim_first, bias=bias, dtype=dtype, device=device) # proj and mlp_out - self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device) + self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, bias=bias, dtype=dtype, device=device) self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations) self.hidden_size = hidden_size self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.mlp_act = nn.GELU(approximate="tanh") if modulation: self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations) else: @@ -279,7 +315,7 @@ class SingleStreamBlock(nn.Module): else: mod = vec - qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) + qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim_first], dim=-1) q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) del qkv @@ -298,11 +334,11 @@ class SingleStreamBlock(nn.Module): class LastLayer(nn.Module): - def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None): + def __init__(self, hidden_size: int, patch_size: int, out_channels: int, bias=True, dtype=None, device=None, operations=None): super().__init__() self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device) - self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device)) + self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=bias, dtype=dtype, device=device) + self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=bias, dtype=dtype, device=device)) def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor: if vec.ndim == 2: diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index b9d36f202..1a24e6d95 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -15,6 +15,7 @@ from .layers import ( MLPEmbedder, SingleStreamBlock, timestep_embedding, + Modulation ) @dataclass @@ -33,6 +34,11 @@ class FluxParams: patch_size: int qkv_bias: bool guidance_embed: bool + global_modulation: bool = False + mlp_silu_act: bool = False + ops_bias: bool = True + default_ref_method: str = "offset" + ref_index_scale: float = 1.0 class Flux(nn.Module): @@ -58,13 +64,17 @@ class Flux(nn.Module): self.hidden_size = params.hidden_size self.num_heads = params.num_heads self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) - self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device) - self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) - self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations) + self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device) + self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations) + if params.vec_in_dim is not None: + self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations) + else: + self.vector_in = None + self.guidance_in = ( - MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity() + MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity() ) - self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device) + self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device) self.double_blocks = nn.ModuleList( [ @@ -73,6 +83,9 @@ class Flux(nn.Module): self.num_heads, mlp_ratio=params.mlp_ratio, qkv_bias=params.qkv_bias, + modulation=params.global_modulation is False, + mlp_silu_act=params.mlp_silu_act, + proj_bias=params.ops_bias, dtype=dtype, device=device, operations=operations ) for _ in range(params.depth) @@ -81,13 +94,30 @@ class Flux(nn.Module): self.single_blocks = nn.ModuleList( [ - SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations) + SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=params.global_modulation is False, mlp_silu_act=params.mlp_silu_act, bias=params.ops_bias, dtype=dtype, device=device, operations=operations) for _ in range(params.depth_single_blocks) ] ) if final_layer: - self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations) + self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, bias=params.ops_bias, dtype=dtype, device=device, operations=operations) + + if params.global_modulation: + self.double_stream_modulation_img = Modulation( + self.hidden_size, + double=True, + bias=False, + dtype=dtype, device=device, operations=operations + ) + self.double_stream_modulation_txt = Modulation( + self.hidden_size, + double=True, + bias=False, + dtype=dtype, device=device, operations=operations + ) + self.single_stream_modulation = Modulation( + self.hidden_size, double=False, bias=False, dtype=dtype, device=device, operations=operations + ) def forward_orig( self, @@ -103,9 +133,6 @@ class Flux(nn.Module): attn_mask: Tensor = None, ) -> Tensor: - if y is None: - y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype) - patches = transformer_options.get("patches", {}) patches_replace = transformer_options.get("patches_replace", {}) if img.ndim != 3 or txt.ndim != 3: @@ -118,9 +145,17 @@ class Flux(nn.Module): if guidance is not None: vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype)) - vec = vec + self.vector_in(y[:, :self.params.vec_in_dim]) + if self.vector_in is not None: + if y is None: + y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype) + vec = vec + self.vector_in(y[:, :self.params.vec_in_dim]) + txt = self.txt_in(txt) + vec_orig = vec + if self.params.global_modulation: + vec = (self.double_stream_modulation_img(vec_orig), self.double_stream_modulation_txt(vec_orig)) + if "post_input" in patches: for p in patches["post_input"]: out = p({"img": img, "txt": txt, "img_ids": img_ids, "txt_ids": txt_ids}) @@ -177,6 +212,9 @@ class Flux(nn.Module): img = torch.cat((txt, img), 1) + if self.params.global_modulation: + vec, _ = self.single_stream_modulation(vec_orig) + for i, block in enumerate(self.single_blocks): if ("single_block", i) in blocks_replace: def block_wrap(args): @@ -207,7 +245,7 @@ class Flux(nn.Module): img = img[:, txt.shape[1] :, ...] - img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels) + img = self.final_layer(img, vec_orig) # (N, T, patch_size ** 2 * out_channels) return img def process_img(self, x, index=0, h_offset=0, w_offset=0, transformer_options={}): @@ -234,10 +272,10 @@ class Flux(nn.Module): h_offset += rope_options.get("shift_y", 0.0) w_offset += rope_options.get("shift_x", 0.0) - img_ids = torch.zeros((steps_h, steps_w, 3), device=x.device, dtype=x.dtype) + img_ids = torch.zeros((steps_h, steps_w, len(self.params.axes_dim)), device=x.device, dtype=torch.float32) img_ids[:, :, 0] = img_ids[:, :, 1] + index - img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=x.dtype).unsqueeze(1) - img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=x.dtype).unsqueeze(0) + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=torch.float32).unsqueeze(1) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=torch.float32).unsqueeze(0) return img, repeat(img_ids, "h w c -> b (h w) c", b=bs) def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs): @@ -259,10 +297,10 @@ class Flux(nn.Module): h = 0 w = 0 index = 0 - ref_latents_method = kwargs.get("ref_latents_method", "offset") + ref_latents_method = kwargs.get("ref_latents_method", self.params.default_ref_method) for ref in ref_latents: if ref_latents_method == "index": - index += 1 + index += self.params.ref_index_scale h_offset = 0 w_offset = 0 elif ref_latents_method == "uxo": @@ -286,7 +324,11 @@ class Flux(nn.Module): img = torch.cat([img, kontext], dim=1) img_ids = torch.cat([img_ids, kontext_ids], dim=1) - txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) + txt_ids = torch.zeros((bs, context.shape[1], len(self.params.axes_dim)), device=x.device, dtype=torch.float32) + + if len(self.params.axes_dim) == 4: # Flux 2 + txt_ids[:, :, 3] = torch.linspace(0, context.shape[1] - 1, steps=context.shape[1], device=x.device, dtype=torch.float32) + out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None)) out = out[:, :img_tokens] - return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h_orig,:w_orig] + return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=self.patch_size, pw=self.patch_size)[:,:,:h_orig,:w_orig] diff --git a/comfy/ldm/models/autoencoder.py b/comfy/ldm/models/autoencoder.py index 611d36a1b..4f50810dc 100644 --- a/comfy/ldm/models/autoencoder.py +++ b/comfy/ldm/models/autoencoder.py @@ -9,6 +9,8 @@ from comfy.ldm.modules.distributions.distributions import DiagonalGaussianDistri from comfy.ldm.util import get_obj_from_str, instantiate_from_config from comfy.ldm.modules.ema import LitEma import comfy.ops +from einops import rearrange +import comfy.model_management class DiagonalGaussianRegularizer(torch.nn.Module): def __init__(self, sample: bool = False): @@ -179,6 +181,21 @@ class AutoencodingEngineLegacy(AutoencodingEngine): self.post_quant_conv = conv_op(embed_dim, ddconfig["z_channels"], 1) self.embed_dim = embed_dim + if ddconfig.get("batch_norm_latent", False): + self.bn_eps = 1e-4 + self.bn_momentum = 0.1 + self.ps = [2, 2] + self.bn = torch.nn.BatchNorm2d(math.prod(self.ps) * ddconfig["z_channels"], + eps=self.bn_eps, + momentum=self.bn_momentum, + affine=False, + track_running_stats=True, + ) + self.bn.eval() + else: + self.bn = None + + def get_autoencoder_params(self) -> list: params = super().get_autoencoder_params() return params @@ -201,11 +218,36 @@ class AutoencodingEngineLegacy(AutoencodingEngine): z = torch.cat(z, 0) z, reg_log = self.regularization(z) + + if self.bn is not None: + z = rearrange(z, + "... c (i pi) (j pj) -> ... (c pi pj) i j", + pi=self.ps[0], + pj=self.ps[1], + ) + + z = torch.nn.functional.batch_norm(z, + comfy.model_management.cast_to(self.bn.running_mean, dtype=z.dtype, device=z.device), + comfy.model_management.cast_to(self.bn.running_var, dtype=z.dtype, device=z.device), + momentum=self.bn_momentum, + eps=self.bn_eps) + if return_reg_log: return z, reg_log return z def decode(self, z: torch.Tensor, **decoder_kwargs) -> torch.Tensor: + if self.bn is not None: + s = torch.sqrt(comfy.model_management.cast_to(self.bn.running_var.view(1, -1, 1, 1), dtype=z.dtype, device=z.device) + self.bn_eps) + m = comfy.model_management.cast_to(self.bn.running_mean.view(1, -1, 1, 1), dtype=z.dtype, device=z.device) + z = z * s + m + z = rearrange( + z, + "... (c pi pj) i j -> ... c (i pi) (j pj)", + pi=self.ps[0], + pj=self.ps[1], + ) + if self.max_batch_size is None: dec = self.post_quant_conv(z) dec = self.decoder(dec, **decoder_kwargs) diff --git a/comfy/model_base.py b/comfy/model_base.py index e14b552c5..cad79ecbd 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -898,12 +898,13 @@ class Flux(BaseModel): attention_mask = kwargs.get("attention_mask", None) if attention_mask is not None: shape = kwargs["noise"].shape - mask_ref_size = kwargs["attention_mask_img_shape"] - # the model will pad to the patch size, and then divide - # essentially dividing and rounding up - (h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size)) - attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok)) - out['attention_mask'] = comfy.conds.CONDRegular(attention_mask) + mask_ref_size = kwargs.get("attention_mask_img_shape", None) + if mask_ref_size is not None: + # the model will pad to the patch size, and then divide + # essentially dividing and rounding up + (h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size)) + attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok)) + out['attention_mask'] = comfy.conds.CONDRegular(attention_mask) guidance = kwargs.get("guidance", 3.5) if guidance is not None: @@ -928,6 +929,16 @@ class Flux(BaseModel): out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16]) return out +class Flux2(Flux): + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + target_text_len = 512 + if cross_attn.shape[1] < target_text_len: + cross_attn = torch.nn.functional.pad(cross_attn, (0, 0, target_text_len - cross_attn.shape[1], 0)) + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + return out class GenmoMochi(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 0131ca25a..b2ba1459d 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -200,26 +200,54 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): #Flux, Chroma or Chroma Radiance (has no img_in.weight) dit_config = {} - dit_config["image_model"] = "flux" + if '{}double_stream_modulation_img.lin.weight'.format(key_prefix) in state_dict_keys: + dit_config["image_model"] = "flux2" + dit_config["axes_dim"] = [32, 32, 32, 32] + dit_config["num_heads"] = 48 + dit_config["mlp_ratio"] = 3.0 + dit_config["theta"] = 2000 + dit_config["out_channels"] = 128 + dit_config["global_modulation"] = True + dit_config["vec_in_dim"] = None + dit_config["mlp_silu_act"] = True + dit_config["qkv_bias"] = False + dit_config["ops_bias"] = False + dit_config["default_ref_method"] = "index" + dit_config["ref_index_scale"] = 10.0 + patch_size = 1 + else: + dit_config["image_model"] = "flux" + dit_config["axes_dim"] = [16, 56, 56] + dit_config["num_heads"] = 24 + dit_config["mlp_ratio"] = 4.0 + dit_config["theta"] = 10000 + dit_config["out_channels"] = 16 + dit_config["qkv_bias"] = True + patch_size = 2 + dit_config["in_channels"] = 16 - patch_size = 2 + dit_config["hidden_size"] = 3072 + dit_config["context_in_dim"] = 4096 + dit_config["patch_size"] = patch_size in_key = "{}img_in.weight".format(key_prefix) if in_key in state_dict_keys: - dit_config["in_channels"] = state_dict[in_key].shape[1] // (patch_size * patch_size) - dit_config["out_channels"] = 16 + w = state_dict[in_key] + dit_config["in_channels"] = w.shape[1] // (patch_size * patch_size) + dit_config["hidden_size"] = w.shape[0] + + txt_in_key = "{}txt_in.weight".format(key_prefix) + if txt_in_key in state_dict_keys: + w = state_dict[txt_in_key] + dit_config["context_in_dim"] = w.shape[1] + dit_config["hidden_size"] = w.shape[0] + vec_in_key = '{}vector_in.in_layer.weight'.format(key_prefix) if vec_in_key in state_dict_keys: dit_config["vec_in_dim"] = state_dict[vec_in_key].shape[1] - dit_config["context_in_dim"] = 4096 - dit_config["hidden_size"] = 3072 - dit_config["mlp_ratio"] = 4.0 - dit_config["num_heads"] = 24 + dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.') dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.') - dit_config["axes_dim"] = [16, 56, 56] - dit_config["theta"] = 10000 - dit_config["qkv_bias"] = True if '{}distilled_guidance_layer.0.norms.0.scale'.format(key_prefix) in state_dict_keys or '{}distilled_guidance_layer.norms.0.scale'.format(key_prefix) in state_dict_keys: #Chroma dit_config["image_model"] = "chroma" dit_config["in_channels"] = 64 diff --git a/comfy/sd.py b/comfy/sd.py index b6df0bd61..14dd8944c 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -356,7 +356,7 @@ class VAE: self.memory_used_encode = lambda shape, dtype: (700 * shape[2] * shape[3]) * model_management.dtype_size(dtype) self.memory_used_decode = lambda shape, dtype: (700 * shape[2] * shape[3] * 32 * 32) * model_management.dtype_size(dtype) - elif sd['decoder.conv_in.weight'].shape[1] == 32: + elif sd['decoder.conv_in.weight'].shape[1] == 32 and sd['decoder.conv_in.weight'].ndim == 5: ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True, "refiner_vae": False} self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] @@ -382,6 +382,17 @@ class VAE: self.upscale_ratio = 4 self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] + if 'decoder.post_quant_conv.weight' in sd: + sd = comfy.utils.state_dict_prefix_replace(sd, {"decoder.post_quant_conv.": "post_quant_conv.", "encoder.quant_conv.": "quant_conv."}) + + if 'bn.running_mean' in sd: + ddconfig["batch_norm_latent"] = True + self.downscale_ratio *= 2 + self.upscale_ratio *= 2 + self.latent_channels *= 4 + old_memory_used_decode = self.memory_used_decode + self.memory_used_decode = lambda shape, dtype: old_memory_used_decode(shape, dtype) * 4.0 + if 'post_quant_conv.weight' in sd: self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1]) else: @@ -940,6 +951,8 @@ class TEModel(Enum): QWEN25_7B = 11 BYT5_SMALL_GLYPH = 12 GEMMA_3_4B = 13 + MISTRAL3_24B = 14 + MISTRAL3_24B_PRUNED_FLUX2 = 15 def detect_te_model(sd): if "text_model.encoder.layers.30.mlp.fc1.weight" in sd: @@ -972,6 +985,13 @@ def detect_te_model(sd): if weight.shape[0] == 512: return TEModel.QWEN25_7B if "model.layers.0.post_attention_layernorm.weight" in sd: + weight = sd['model.layers.0.post_attention_layernorm.weight'] + if weight.shape[0] == 5120: + if "model.layers.39.post_attention_layernorm.weight" in sd: + return TEModel.MISTRAL3_24B + else: + return TEModel.MISTRAL3_24B_PRUNED_FLUX2 + return TEModel.LLAMA3_8 return None @@ -1086,6 +1106,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip else: clip_target.clip = comfy.text_encoders.qwen_image.te(**llama_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.qwen_image.QwenImageTokenizer + elif te_model == TEModel.MISTRAL3_24B or te_model == TEModel.MISTRAL3_24B_PRUNED_FLUX2: + clip_target.clip = comfy.text_encoders.flux.flux2_te(**llama_detect(clip_data), pruned=te_model == TEModel.MISTRAL3_24B_PRUNED_FLUX2) + clip_target.tokenizer = comfy.text_encoders.flux.Flux2Tokenizer + tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None) else: # clip_l if clip_type == CLIPType.SD3: diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 2e64b85e8..8fe8e63f6 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -741,6 +741,37 @@ class FluxSchnell(Flux): out = model_base.Flux(self, model_type=model_base.ModelType.FLOW, device=device) return out +class Flux2(Flux): + unet_config = { + "image_model": "flux2", + } + + sampling_settings = { + "shift": 2.02, + } + + unet_extra_config = {} + latent_format = latent_formats.Flux2 + + supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] + + vae_key_prefix = ["vae."] + text_encoder_key_prefix = ["text_encoders."] + + def __init__(self, unet_config): + super().__init__(unet_config) + self.memory_usage_factor = self.memory_usage_factor * (2.0 * 2.0) * 2.36 + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.Flux2(self, device=device) + return out + + def clip_target(self, state_dict={}): + return None # TODO + pref = self.text_encoder_key_prefix[0] + t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(**t5_detect)) + class GenmoMochi(supported_models_base.BASE): unet_config = { "image_model": "mochi_preview", @@ -1422,6 +1453,7 @@ class HunyuanVideo15_SR_Distilled(HunyuanVideo): hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2] + models += [SVD_img2vid] diff --git a/comfy/text_encoders/flux.py b/comfy/text_encoders/flux.py index d61ef6668..8dbbca16e 100644 --- a/comfy/text_encoders/flux.py +++ b/comfy/text_encoders/flux.py @@ -1,10 +1,13 @@ from comfy import sd1_clip import comfy.text_encoders.t5 import comfy.text_encoders.sd3_clip +import comfy.text_encoders.llama import comfy.model_management -from transformers import T5TokenizerFast +from transformers import T5TokenizerFast, LlamaTokenizerFast import torch import os +import json +import base64 class T5XXLTokenizer(sd1_clip.SDTokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): @@ -68,3 +71,105 @@ def flux_clip(dtype_t5=None, t5xxl_scaled_fp8=None): model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 super().__init__(dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options) return FluxClipModel_ + +def load_mistral_tokenizer(data): + if torch.is_tensor(data): + data = data.numpy().tobytes() + + try: + from transformers.integrations.mistral import MistralConverter + except ModuleNotFoundError: + from transformers.models.pixtral.convert_pixtral_weights_to_hf import MistralConverter + + mistral_vocab = json.loads(data) + + special_tokens = {} + vocab = {} + + max_vocab = mistral_vocab["config"]["default_vocab_size"] + + for w in mistral_vocab["vocab"]: + r = w["rank"] + if r >= max_vocab: + continue + + vocab[base64.b64decode(w["token_bytes"])] = r + + for w in mistral_vocab["special_tokens"]: + if "token_bytes" in w: + special_tokens[base64.b64decode(w["token_bytes"])] = w["rank"] + else: + special_tokens[w["token_str"]] = w["rank"] + + all_special = [] + for v in special_tokens: + all_special.append(v) + + special_tokens.update(vocab) + vocab = special_tokens + return {"tokenizer_object": MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted(), "legacy": False} + +class MistralTokenizerClass: + @staticmethod + def from_pretrained(path, **kwargs): + return LlamaTokenizerFast(**kwargs) + +class Mistral3Tokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + self.tekken_data = tokenizer_data.get("tekken_model", None) + super().__init__("", pad_with_end=False, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, max_length=99999999, min_length=1, pad_left=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data) + + def state_dict(self): + return {"tekken_model": self.tekken_data} + +class Flux2Tokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="mistral3_24b", tokenizer=Mistral3Tokenizer) + self.llama_template = '[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]{}[/INST]' + + def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs): + if llama_template is None: + llama_text = self.llama_template.format(text) + else: + llama_text = llama_template.format(text) + + tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs) + return tokens + +class Mistral3_24BModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="all", layer_idx=None, dtype=None, attention_mask=True, model_options={}): + textmodel_json_config = {} + num_layers = model_options.get("num_layers", None) + if num_layers is not None: + textmodel_json_config["num_hidden_layers"] = num_layers + if num_layers < 40: + textmodel_json_config["final_norm"] = False + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 1, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Mistral3Small24B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) + +class Flux2TEModel(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options={}, name="mistral3_24b", clip_model=Mistral3_24BModel): + super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options) + + def encode_token_weights(self, token_weight_pairs): + out, pooled, extra = super().encode_token_weights(token_weight_pairs) + + out = torch.stack((out[:, 10], out[:, 20], out[:, 30]), dim=1) + out = out.movedim(1, 2) + out = out.reshape(out.shape[0], out.shape[1], -1) + return out, pooled, extra + +def flux2_te(dtype_llama=None, llama_scaled_fp8=None, llama_quantization_metadata=None, pruned=False): + class Flux2TEModel_(Flux2TEModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: + model_options = model_options.copy() + model_options["scaled_fp8"] = llama_scaled_fp8 + if dtype_llama is not None: + dtype = dtype_llama + if llama_quantization_metadata is not None: + model_options["quantization_metadata"] = llama_quantization_metadata + if pruned: + model_options = model_options.copy() + model_options["num_layers"] = 30 + super().__init__(device=device, dtype=dtype, model_options=model_options) + return Flux2TEModel_ diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index feb44bbb0..749ff581b 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -34,6 +34,28 @@ class Llama2Config: rope_scale = None final_norm: bool = True +@dataclass +class Mistral3Small24BConfig: + vocab_size: int = 131072 + hidden_size: int = 5120 + intermediate_size: int = 32768 + num_hidden_layers: int = 40 + num_attention_heads: int = 32 + num_key_value_heads: int = 8 + max_position_embeddings: int = 8192 + rms_norm_eps: float = 1e-5 + rope_theta: float = 1000000000.0 + transformer_type: str = "llama" + head_dim = 128 + rms_norm_add = False + mlp_activation = "silu" + qkv_bias = False + rope_dims = None + q_norm = None + k_norm = None + rope_scale = None + final_norm: bool = True + @dataclass class Qwen25_3BConfig: vocab_size: int = 151936 @@ -465,6 +487,15 @@ class Llama2(BaseLlama, torch.nn.Module): self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) self.dtype = dtype +class Mistral3Small24B(BaseLlama, torch.nn.Module): + def __init__(self, config_dict, dtype, device, operations): + super().__init__() + config = Mistral3Small24BConfig(**config_dict) + self.num_layers = config.num_hidden_layers + + self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) + self.dtype = dtype + class Qwen25_3B(BaseLlama, torch.nn.Module): def __init__(self, config_dict, dtype, device, operations): super().__init__() diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index ce1b2e89f..d9c4bba81 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -2,7 +2,10 @@ import node_helpers import comfy.utils from typing_extensions import override from comfy_api.latest import ComfyExtension, io - +import comfy.model_management +import torch +import math +import nodes class CLIPTextEncodeFlux(io.ComfyNode): @classmethod @@ -30,6 +33,27 @@ class CLIPTextEncodeFlux(io.ComfyNode): encode = execute # TODO: remove +class EmptyFlux2LatentImage(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="EmptyFlux2LatentImage", + display_name="Empty Flux 2 Latent", + category="latent", + inputs=[ + io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(), + ], + ) + + @classmethod + def execute(cls, width, height, batch_size=1) -> io.NodeOutput: + latent = torch.zeros([batch_size, 128, height // 16, width // 16], device=comfy.model_management.intermediate_device()) + return io.NodeOutput({"samples": latent}) class FluxGuidance(io.ComfyNode): @classmethod @@ -154,6 +178,58 @@ class FluxKontextMultiReferenceLatentMethod(io.ComfyNode): append = execute # TODO: remove +def generalized_time_snr_shift(t, mu: float, sigma: float): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + +def compute_empirical_mu(image_seq_len: int, num_steps: int) -> float: + a1, b1 = 8.73809524e-05, 1.89833333 + a2, b2 = 0.00016927, 0.45666666 + + if image_seq_len > 4300: + mu = a2 * image_seq_len + b2 + return float(mu) + + m_200 = a2 * image_seq_len + b2 + m_10 = a1 * image_seq_len + b1 + + a = (m_200 - m_10) / 190.0 + b = m_200 - 200.0 * a + mu = a * num_steps + b + + return float(mu) + + +def get_schedule(num_steps: int, image_seq_len: int) -> list[float]: + mu = compute_empirical_mu(image_seq_len, num_steps) + timesteps = torch.linspace(1, 0, num_steps + 1) + timesteps = generalized_time_snr_shift(timesteps, mu, 1.0) + return timesteps + + +class Flux2Scheduler(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="Flux2Scheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Int.Input("steps", default=20, min=1, max=4096), + io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=1), + io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=1), + ], + outputs=[ + io.Sigmas.Output(), + ], + ) + + @classmethod + def execute(cls, steps, width, height) -> io.NodeOutput: + seq_len = (width * height / (16 * 16)) + sigmas = get_schedule(steps, round(seq_len)) + return io.NodeOutput(sigmas) + + class FluxExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[io.ComfyNode]]: @@ -163,6 +239,8 @@ class FluxExtension(ComfyExtension): FluxDisableGuidance, FluxKontextImageScale, FluxKontextMultiReferenceLatentMethod, + EmptyFlux2LatentImage, + Flux2Scheduler, ] diff --git a/nodes.py b/nodes.py index f023ae3b6..f4835c02e 100644 --- a/nodes.py +++ b/nodes.py @@ -929,7 +929,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), From 5c7b08ca58f5412b3a814b374793cacdb5b5f0a7 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 25 Nov 2025 18:09:07 +0200 Subject: [PATCH 615/759] [API Nodes] add Flux.2 Pro node (#10880) --- comfy_api_nodes/apis/bfl_api.py | 28 +++- comfy_api_nodes/nodes_bfl.py | 238 +++++++++++++++---------------- comfy_api_nodes/util/__init__.py | 2 + 3 files changed, 143 insertions(+), 125 deletions(-) diff --git a/comfy_api_nodes/apis/bfl_api.py b/comfy_api_nodes/apis/bfl_api.py index 0fc8c0607..d8d3557b3 100644 --- a/comfy_api_nodes/apis/bfl_api.py +++ b/comfy_api_nodes/apis/bfl_api.py @@ -70,6 +70,29 @@ class BFLFluxProGenerateRequest(BaseModel): # ) +class Flux2ProGenerateRequest(BaseModel): + prompt: str = Field(...) + width: int = Field(1024, description="Must be a multiple of 32.") + height: int = Field(768, description="Must be a multiple of 32.") + seed: int | None = Field(None) + prompt_upsampling: bool | None = Field(None) + input_image: str | None = Field(None, description="Base64 encoded image for image-to-image generation") + input_image_2: str | None = Field(None, description="Base64 encoded image for image-to-image generation") + input_image_3: str | None = Field(None, description="Base64 encoded image for image-to-image generation") + input_image_4: str | None = Field(None, description="Base64 encoded image for image-to-image generation") + input_image_5: str | None = Field(None, description="Base64 encoded image for image-to-image generation") + input_image_6: str | None = Field(None, description="Base64 encoded image for image-to-image generation") + input_image_7: str | None = Field(None, description="Base64 encoded image for image-to-image generation") + input_image_8: str | None = Field(None, description="Base64 encoded image for image-to-image generation") + input_image_9: str | None = Field(None, description="Base64 encoded image for image-to-image generation") + safety_tolerance: int | None = Field( + 5, description="Tolerance level for input and output moderation. Value 0 being most strict.", ge=0, le=5 + ) + output_format: str | None = Field( + "png", description="Output format for the generated image. Can be 'jpeg' or 'png'." + ) + + class BFLFluxKontextProGenerateRequest(BaseModel): prompt: str = Field(..., description='The text prompt for what you wannt to edit.') input_image: Optional[str] = Field(None, description='Image to edit in base64 format') @@ -109,8 +132,9 @@ class BFLFluxProUltraGenerateRequest(BaseModel): class BFLFluxProGenerateResponse(BaseModel): - id: str = Field(..., description='The unique identifier for the generation task.') - polling_url: str = Field(..., description='URL to poll for the generation result.') + id: str = Field(..., description="The unique identifier for the generation task.") + polling_url: str = Field(..., description="URL to poll for the generation result.") + cost: float | None = Field(None, description="Price in cents") class BFLStatus(str, Enum): diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py index 1740fb377..8826dea0c 100644 --- a/comfy_api_nodes/nodes_bfl.py +++ b/comfy_api_nodes/nodes_bfl.py @@ -1,7 +1,7 @@ from inspect import cleandoc -from typing import Optional import torch +from pydantic import BaseModel from typing_extensions import override from comfy_api.latest import IO, ComfyExtension @@ -9,15 +9,16 @@ from comfy_api_nodes.apis.bfl_api import ( BFLFluxExpandImageRequest, BFLFluxFillImageRequest, BFLFluxKontextProGenerateRequest, - BFLFluxProGenerateRequest, BFLFluxProGenerateResponse, BFLFluxProUltraGenerateRequest, BFLFluxStatusResponse, BFLStatus, + Flux2ProGenerateRequest, ) from comfy_api_nodes.util import ( ApiEndpoint, download_url_to_image_tensor, + get_number_of_images, poll_op, resize_mask_to_image, sync_op, @@ -116,7 +117,7 @@ class FluxProUltraImageNode(IO.ComfyNode): prompt_upsampling: bool = False, raw: bool = False, seed: int = 0, - image_prompt: Optional[torch.Tensor] = None, + image_prompt: torch.Tensor | None = None, image_prompt_strength: float = 0.1, ) -> IO.NodeOutput: if image_prompt is None: @@ -230,7 +231,7 @@ class FluxKontextProImageNode(IO.ComfyNode): aspect_ratio: str, guidance: float, steps: int, - input_image: Optional[torch.Tensor] = None, + input_image: torch.Tensor | None = None, seed=0, prompt_upsampling=False, ) -> IO.NodeOutput: @@ -280,124 +281,6 @@ class FluxKontextMaxImageNode(FluxKontextProImageNode): DISPLAY_NAME = "Flux.1 Kontext [max] Image" -class FluxProImageNode(IO.ComfyNode): - """ - Generates images synchronously based on prompt and resolution. - """ - - @classmethod - def define_schema(cls) -> IO.Schema: - return IO.Schema( - node_id="FluxProImageNode", - display_name="Flux 1.1 [pro] Image", - category="api node/image/BFL", - description=cleandoc(cls.__doc__ or ""), - inputs=[ - IO.String.Input( - "prompt", - multiline=True, - default="", - tooltip="Prompt for the image generation", - ), - IO.Boolean.Input( - "prompt_upsampling", - default=False, - tooltip="Whether to perform upsampling on the prompt. " - "If active, automatically modifies the prompt for more creative generation, " - "but results are nondeterministic (same seed will not produce exactly the same result).", - ), - IO.Int.Input( - "width", - default=1024, - min=256, - max=1440, - step=32, - ), - IO.Int.Input( - "height", - default=768, - min=256, - max=1440, - step=32, - ), - IO.Int.Input( - "seed", - default=0, - min=0, - max=0xFFFFFFFFFFFFFFFF, - control_after_generate=True, - tooltip="The random seed used for creating the noise.", - ), - IO.Image.Input( - "image_prompt", - optional=True, - ), - # "image_prompt_strength": ( - # IO.FLOAT, - # { - # "default": 0.1, - # "min": 0.0, - # "max": 1.0, - # "step": 0.01, - # "tooltip": "Blend between the prompt and the image prompt.", - # }, - # ), - ], - outputs=[IO.Image.Output()], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], - is_api_node=True, - ) - - @classmethod - async def execute( - cls, - prompt: str, - prompt_upsampling, - width: int, - height: int, - seed=0, - image_prompt=None, - # image_prompt_strength=0.1, - ) -> IO.NodeOutput: - image_prompt = image_prompt if image_prompt is None else tensor_to_base64_string(image_prompt) - initial_response = await sync_op( - cls, - ApiEndpoint( - path="/proxy/bfl/flux-pro-1.1/generate", - method="POST", - ), - response_model=BFLFluxProGenerateResponse, - data=BFLFluxProGenerateRequest( - prompt=prompt, - prompt_upsampling=prompt_upsampling, - width=width, - height=height, - seed=seed, - image_prompt=image_prompt, - ), - ) - response = await poll_op( - cls, - ApiEndpoint(initial_response.polling_url), - response_model=BFLFluxStatusResponse, - status_extractor=lambda r: r.status, - progress_extractor=lambda r: r.progress, - completed_statuses=[BFLStatus.ready], - failed_statuses=[ - BFLStatus.request_moderated, - BFLStatus.content_moderated, - BFLStatus.error, - BFLStatus.task_not_found, - ], - queued_statuses=[], - ) - return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) - - class FluxProExpandNode(IO.ComfyNode): """ Outpaints image based on prompt. @@ -640,16 +523,125 @@ class FluxProFillNode(IO.ComfyNode): return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) +class Flux2ProImageNode(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Flux2ProImageNode", + display_name="Flux.2 [pro] Image", + category="api node/image/BFL", + description="Generates images synchronously based on prompt and resolution.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation or edit", + ), + IO.Int.Input( + "width", + default=1024, + min=256, + max=2048, + step=32, + ), + IO.Int.Input( + "height", + default=768, + min=256, + max=2048, + step=32, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + ), + IO.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. " + "If active, automatically modifies the prompt for more creative generation, " + "but results are nondeterministic (same seed will not produce exactly the same result).", + ), + IO.Image.Input("images", optional=True, tooltip="Up to 4 images to be used as references."), + ], + outputs=[IO.Image.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + prompt: str, + width: int, + height: int, + seed: int, + prompt_upsampling: bool, + images: torch.Tensor | None = None, + ) -> IO.NodeOutput: + reference_images = {} + if images is not None: + if get_number_of_images(images) > 9: + raise ValueError("The current maximum number of supported images is 9.") + for image_index in range(images.shape[0]): + key_name = f"input_image_{image_index + 1}" if image_index else "input_image" + reference_images[key_name] = tensor_to_base64_string(images[image_index], total_pixels=2048 * 2048) + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/bfl/flux-2-pro/generate", method="POST"), + response_model=BFLFluxProGenerateResponse, + data=Flux2ProGenerateRequest( + prompt=prompt, + width=width, + height=height, + seed=seed, + prompt_upsampling=prompt_upsampling, + **reference_images, + ), + ) + + def price_extractor(_r: BaseModel) -> float | None: + return None if initial_response.cost is None else initial_response.cost / 100 + + response = await poll_op( + cls, + ApiEndpoint(initial_response.polling_url), + response_model=BFLFluxStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + price_extractor=price_extractor, + completed_statuses=[BFLStatus.ready], + failed_statuses=[ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + BFLStatus.error, + BFLStatus.task_not_found, + ], + queued_statuses=[], + ) + return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) + + class BFLExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ FluxProUltraImageNode, - # FluxProImageNode, FluxKontextProImageNode, FluxKontextMaxImageNode, FluxProExpandNode, FluxProFillNode, + Flux2ProImageNode, ] diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py index 21013b591..80292fb3c 100644 --- a/comfy_api_nodes/util/__init__.py +++ b/comfy_api_nodes/util/__init__.py @@ -36,6 +36,7 @@ from .upload_helpers import ( upload_video_to_comfyapi, ) from .validation_utils import ( + get_image_dimensions, get_number_of_images, validate_aspect_ratio_string, validate_audio_duration, @@ -82,6 +83,7 @@ __all__ = [ "trim_video", "video_to_base64_string", # Validation utilities + "get_image_dimensions", "get_number_of_images", "validate_aspect_ratio_string", "validate_audio_duration", From af81cb962d9dd283ddb551962cc223b5a186a1ce Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 25 Nov 2025 08:40:32 -0800 Subject: [PATCH 616/759] Add Flux 2 support to README. (#10882) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 28beec427..b9300ab07 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/) - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/) - [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/) + - [Flux 2](https://comfyanonymous.github.io/ComfyUI_examples/flux2/) - Image Editing Models - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model) From 828b1b9953175b6df79459f417d1032869d0b46a Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 25 Nov 2025 12:40:58 -0500 Subject: [PATCH 617/759] ComfyUI version v0.3.72 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index b4655d553..dac038c26 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.71" +__version__ = "0.3.72" diff --git a/pyproject.toml b/pyproject.toml index 280dbaf53..75df8fb7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.71" +version = "0.3.72" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From dff996ca39d86265bbabf15e666484e051f0b3d5 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 25 Nov 2025 11:30:24 -0800 Subject: [PATCH 618/759] Fix crash. (#10885) --- comfy/text_encoders/flux.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy/text_encoders/flux.py b/comfy/text_encoders/flux.py index 8dbbca16e..024504a5b 100644 --- a/comfy/text_encoders/flux.py +++ b/comfy/text_encoders/flux.py @@ -87,6 +87,7 @@ def load_mistral_tokenizer(data): vocab = {} max_vocab = mistral_vocab["config"]["default_vocab_size"] + max_vocab -= len(mistral_vocab["special_tokens"]) for w in mistral_vocab["vocab"]: r = w["rank"] From 18b79acba95d44b4ea00bbbfc1856bc71bd58841 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 26 Nov 2025 03:58:21 +0800 Subject: [PATCH 619/759] Update workflow templates to v0.7.20 (#10883) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b7014f956..5f20816d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.30.6 -comfyui-workflow-templates==0.7.9 +comfyui-workflow-templates==0.7.20 comfyui-embedded-docs==0.3.1 torch torchsde From d196a905bb379a6d800d0c13f9b4fdea3965311a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 25 Nov 2025 11:58:39 -0800 Subject: [PATCH 620/759] Lower vram usage for flux 2 text encoder. (#10887) --- comfy/sd1_clip.py | 7 ++++--- comfy/text_encoders/flux.py | 4 ++-- comfy/text_encoders/llama.py | 12 +++++++++--- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 8f509bab1..0fc9ab3db 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -90,7 +90,6 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=True, enable_attention_masks=False, zero_out_masked=False, return_projected_pooled=True, return_attention_masks=False, model_options={}): # clip-vit-base-patch32 super().__init__() - assert layer in self.LAYERS if textmodel_json_config is None: textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_clip_config.json") @@ -164,7 +163,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): def set_clip_options(self, options): layer_idx = options.get("layer", self.layer_idx) self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled) - if self.layer == "all": + if isinstance(self.layer, list) or self.layer == "all": pass elif layer_idx is None or abs(layer_idx) > self.num_layers: self.layer = "last" @@ -266,7 +265,9 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): if self.enable_attention_masks: attention_mask_model = attention_mask - if self.layer == "all": + if isinstance(self.layer, list): + intermediate_output = self.layer + elif self.layer == "all": intermediate_output = "all" else: intermediate_output = self.layer_idx diff --git a/comfy/text_encoders/flux.py b/comfy/text_encoders/flux.py index 024504a5b..99f4812bb 100644 --- a/comfy/text_encoders/flux.py +++ b/comfy/text_encoders/flux.py @@ -138,7 +138,7 @@ class Flux2Tokenizer(sd1_clip.SD1Tokenizer): return tokens class Mistral3_24BModel(sd1_clip.SDClipModel): - def __init__(self, device="cpu", layer="all", layer_idx=None, dtype=None, attention_mask=True, model_options={}): + def __init__(self, device="cpu", layer=[10, 20, 30], layer_idx=None, dtype=None, attention_mask=True, model_options={}): textmodel_json_config = {} num_layers = model_options.get("num_layers", None) if num_layers is not None: @@ -154,7 +154,7 @@ class Flux2TEModel(sd1_clip.SD1ClipModel): def encode_token_weights(self, token_weight_pairs): out, pooled, extra = super().encode_token_weights(token_weight_pairs) - out = torch.stack((out[:, 10], out[:, 20], out[:, 30]), dim=1) + out = torch.stack((out[:, 0], out[:, 1], out[:, 2]), dim=1) out = out.movedim(1, 2) out = out.reshape(out.shape[0], out.shape[1], -1) return out, pooled, extra diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 749ff581b..d47ed27bc 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -434,8 +434,12 @@ class Llama2_(nn.Module): intermediate = None all_intermediate = None + only_layers = None if intermediate_output is not None: - if intermediate_output == "all": + if isinstance(intermediate_output, list): + all_intermediate = [] + only_layers = set(intermediate_output) + elif intermediate_output == "all": all_intermediate = [] intermediate_output = None elif intermediate_output < 0: @@ -443,7 +447,8 @@ class Llama2_(nn.Module): for i, layer in enumerate(self.layers): if all_intermediate is not None: - all_intermediate.append(x.unsqueeze(1).clone()) + if only_layers is None or (i in only_layers): + all_intermediate.append(x.unsqueeze(1).clone()) x = layer( x=x, attention_mask=mask, @@ -457,7 +462,8 @@ class Llama2_(nn.Module): x = self.norm(x) if all_intermediate is not None: - all_intermediate.append(x.unsqueeze(1).clone()) + if only_layers is None or ((i + 1) in only_layers): + all_intermediate.append(x.unsqueeze(1).clone()) if all_intermediate is not None: intermediate = torch.cat(all_intermediate, dim=1) From 0c18842acbdf546883b08808dd9feea7605d7649 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 25 Nov 2025 14:59:37 -0500 Subject: [PATCH 621/759] ComfyUI v0.3.73 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index dac038c26..f8818838e 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.72" +__version__ = "0.3.73" diff --git a/pyproject.toml b/pyproject.toml index 75df8fb7c..7e4bac12d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.72" +version = "0.3.73" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From e9aae31fa241a6a63a368800146ea91629d4e8c2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 25 Nov 2025 15:41:45 -0800 Subject: [PATCH 622/759] Z Image model. (#10892) --- comfy/ldm/lumina/model.py | 219 +++++++------------- comfy/ldm/modules/diffusionmodules/mmdit.py | 6 +- comfy/model_base.py | 4 + comfy/model_detection.py | 29 ++- comfy/sd.py | 8 + comfy/text_encoders/llama.py | 31 +++ comfy/text_encoders/z_image.py | 48 +++++ 7 files changed, 196 insertions(+), 149 deletions(-) create mode 100644 comfy/text_encoders/z_image.py diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index b4494a51d..c8643eb82 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -11,6 +11,7 @@ import comfy.ldm.common_dit from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder from comfy.ldm.modules.attention import optimized_attention_masked from comfy.ldm.flux.layers import EmbedND +from comfy.ldm.flux.math import apply_rope import comfy.patcher_extension @@ -31,6 +32,7 @@ class JointAttention(nn.Module): n_heads: int, n_kv_heads: Optional[int], qk_norm: bool, + out_bias: bool = False, operation_settings={}, ): """ @@ -59,7 +61,7 @@ class JointAttention(nn.Module): self.out = operation_settings.get("operations").Linear( n_heads * self.head_dim, dim, - bias=False, + bias=out_bias, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"), ) @@ -70,35 +72,6 @@ class JointAttention(nn.Module): else: self.q_norm = self.k_norm = nn.Identity() - @staticmethod - def apply_rotary_emb( - x_in: torch.Tensor, - freqs_cis: torch.Tensor, - ) -> torch.Tensor: - """ - Apply rotary embeddings to input tensors using the given frequency - tensor. - - This function applies rotary embeddings to the given query 'xq' and - key 'xk' tensors using the provided frequency tensor 'freqs_cis'. The - input tensors are reshaped as complex numbers, and the frequency tensor - is reshaped for broadcasting compatibility. The resulting tensors - contain rotary embeddings and are returned as real tensors. - - Args: - x_in (torch.Tensor): Query or Key tensor to apply rotary embeddings. - freqs_cis (torch.Tensor): Precomputed frequency tensor for complex - exponentials. - - Returns: - Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor - and key tensor with rotary embeddings. - """ - - t_ = x_in.reshape(*x_in.shape[:-1], -1, 1, 2) - t_out = freqs_cis[..., 0] * t_[..., 0] + freqs_cis[..., 1] * t_[..., 1] - return t_out.reshape(*x_in.shape) - def forward( self, x: torch.Tensor, @@ -134,8 +107,7 @@ class JointAttention(nn.Module): xq = self.q_norm(xq) xk = self.k_norm(xk) - xq = JointAttention.apply_rotary_emb(xq, freqs_cis=freqs_cis) - xk = JointAttention.apply_rotary_emb(xk, freqs_cis=freqs_cis) + xq, xk = apply_rope(xq, xk, freqs_cis) n_rep = self.n_local_heads // self.n_local_kv_heads if n_rep >= 1: @@ -215,6 +187,8 @@ class JointTransformerBlock(nn.Module): norm_eps: float, qk_norm: bool, modulation=True, + z_image_modulation=False, + attn_out_bias=False, operation_settings={}, ) -> None: """ @@ -235,10 +209,10 @@ class JointTransformerBlock(nn.Module): super().__init__() self.dim = dim self.head_dim = dim // n_heads - self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, operation_settings=operation_settings) + self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, out_bias=attn_out_bias, operation_settings=operation_settings) self.feed_forward = FeedForward( dim=dim, - hidden_dim=4 * dim, + hidden_dim=dim, multiple_of=multiple_of, ffn_dim_multiplier=ffn_dim_multiplier, operation_settings=operation_settings, @@ -252,16 +226,27 @@ class JointTransformerBlock(nn.Module): self.modulation = modulation if modulation: - self.adaLN_modulation = nn.Sequential( - nn.SiLU(), - operation_settings.get("operations").Linear( - min(dim, 1024), - 4 * dim, - bias=True, - device=operation_settings.get("device"), - dtype=operation_settings.get("dtype"), - ), - ) + if z_image_modulation: + self.adaLN_modulation = nn.Sequential( + operation_settings.get("operations").Linear( + min(dim, 256), + 4 * dim, + bias=True, + device=operation_settings.get("device"), + dtype=operation_settings.get("dtype"), + ), + ) + else: + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), + operation_settings.get("operations").Linear( + min(dim, 1024), + 4 * dim, + bias=True, + device=operation_settings.get("device"), + dtype=operation_settings.get("dtype"), + ), + ) def forward( self, @@ -323,7 +308,7 @@ class FinalLayer(nn.Module): The final layer of NextDiT. """ - def __init__(self, hidden_size, patch_size, out_channels, operation_settings={}): + def __init__(self, hidden_size, patch_size, out_channels, z_image_modulation=False, operation_settings={}): super().__init__() self.norm_final = operation_settings.get("operations").LayerNorm( hidden_size, @@ -340,10 +325,15 @@ class FinalLayer(nn.Module): dtype=operation_settings.get("dtype"), ) + if z_image_modulation: + min_mod = 256 + else: + min_mod = 1024 + self.adaLN_modulation = nn.Sequential( nn.SiLU(), operation_settings.get("operations").Linear( - min(hidden_size, 1024), + min(hidden_size, min_mod), hidden_size, bias=True, device=operation_settings.get("device"), @@ -373,12 +363,16 @@ class NextDiT(nn.Module): n_heads: int = 32, n_kv_heads: Optional[int] = None, multiple_of: int = 256, - ffn_dim_multiplier: Optional[float] = None, + ffn_dim_multiplier: float = 4.0, norm_eps: float = 1e-5, qk_norm: bool = False, cap_feat_dim: int = 5120, axes_dims: List[int] = (16, 56, 56), axes_lens: List[int] = (1, 512, 512), + rope_theta=10000.0, + z_image_modulation=False, + time_scale=1.0, + pad_tokens_multiple=None, image_model=None, device=None, dtype=None, @@ -390,6 +384,8 @@ class NextDiT(nn.Module): self.in_channels = in_channels self.out_channels = in_channels self.patch_size = patch_size + self.time_scale = time_scale + self.pad_tokens_multiple = pad_tokens_multiple self.x_embedder = operation_settings.get("operations").Linear( in_features=patch_size * patch_size * in_channels, @@ -411,6 +407,7 @@ class NextDiT(nn.Module): norm_eps, qk_norm, modulation=True, + z_image_modulation=z_image_modulation, operation_settings=operation_settings, ) for layer_id in range(n_refiner_layers) @@ -434,7 +431,7 @@ class NextDiT(nn.Module): ] ) - self.t_embedder = TimestepEmbedder(min(dim, 1024), **operation_settings) + self.t_embedder = TimestepEmbedder(min(dim, 1024), output_size=256 if z_image_modulation else None, **operation_settings) self.cap_embedder = nn.Sequential( operation_settings.get("operations").RMSNorm(cap_feat_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), operation_settings.get("operations").Linear( @@ -457,18 +454,24 @@ class NextDiT(nn.Module): ffn_dim_multiplier, norm_eps, qk_norm, + z_image_modulation=z_image_modulation, + attn_out_bias=False, operation_settings=operation_settings, ) for layer_id in range(n_layers) ] ) self.norm_final = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) - self.final_layer = FinalLayer(dim, patch_size, self.out_channels, operation_settings=operation_settings) + self.final_layer = FinalLayer(dim, patch_size, self.out_channels, z_image_modulation=z_image_modulation, operation_settings=operation_settings) + + if self.pad_tokens_multiple is not None: + self.x_pad_token = nn.Parameter(torch.empty((1, dim), device=device, dtype=dtype)) + self.cap_pad_token = nn.Parameter(torch.empty((1, dim), device=device, dtype=dtype)) assert (dim // n_heads) == sum(axes_dims) self.axes_dims = axes_dims self.axes_lens = axes_lens - self.rope_embedder = EmbedND(dim=dim // n_heads, theta=10000.0, axes_dim=axes_dims) + self.rope_embedder = EmbedND(dim=dim // n_heads, theta=rope_theta, axes_dim=axes_dims) self.dim = dim self.n_heads = n_heads @@ -503,108 +506,42 @@ class NextDiT(nn.Module): bsz = len(x) pH = pW = self.patch_size device = x[0].device - dtype = x[0].dtype - if cap_mask is not None: - l_effective_cap_len = cap_mask.sum(dim=1).tolist() - else: - l_effective_cap_len = [num_tokens] * bsz + if self.pad_tokens_multiple is not None: + pad_extra = (-cap_feats.shape[1]) % self.pad_tokens_multiple + cap_feats = torch.cat((cap_feats, self.cap_pad_token.to(device=cap_feats.device, dtype=cap_feats.dtype).unsqueeze(0).repeat(cap_feats.shape[0], pad_extra, 1)), dim=1) - if cap_mask is not None and not torch.is_floating_point(cap_mask): - cap_mask = (cap_mask - 1).to(dtype) * torch.finfo(dtype).max + cap_pos_ids = torch.zeros(bsz, cap_feats.shape[1], 3, dtype=torch.float32, device=device) + cap_pos_ids[:, :, 0] = torch.arange(cap_feats.shape[1], dtype=torch.float32, device=device) + 1.0 - img_sizes = [(img.size(1), img.size(2)) for img in x] - l_effective_img_len = [(H // pH) * (W // pW) for (H, W) in img_sizes] + B, C, H, W = x.shape + x = self.x_embedder(x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2)) - max_seq_len = max( - (cap_len+img_len for cap_len, img_len in zip(l_effective_cap_len, l_effective_img_len)) - ) - max_cap_len = max(l_effective_cap_len) - max_img_len = max(l_effective_img_len) + H_tokens, W_tokens = H // pH, W // pW + x_pos_ids = torch.zeros((bsz, x.shape[1], 3), dtype=torch.float32, device=device) + x_pos_ids[:, :, 0] = cap_feats.shape[1] + 1 + x_pos_ids[:, :, 1] = torch.arange(H_tokens, dtype=torch.float32, device=device).view(-1, 1).repeat(1, W_tokens).flatten() + x_pos_ids[:, :, 2] = torch.arange(W_tokens, dtype=torch.float32, device=device).view(1, -1).repeat(H_tokens, 1).flatten() - position_ids = torch.zeros(bsz, max_seq_len, 3, dtype=torch.float32, device=device) + if self.pad_tokens_multiple is not None: + pad_extra = (-x.shape[1]) % self.pad_tokens_multiple + x = torch.cat((x, self.x_pad_token.to(device=x.device, dtype=x.dtype).unsqueeze(0).repeat(x.shape[0], pad_extra, 1)), dim=1) + x_pos_ids = torch.nn.functional.pad(x_pos_ids, (0, 0, 0, pad_extra)) - for i in range(bsz): - cap_len = l_effective_cap_len[i] - img_len = l_effective_img_len[i] - H, W = img_sizes[i] - H_tokens, W_tokens = H // pH, W // pW - assert H_tokens * W_tokens == img_len - - rope_options = transformer_options.get("rope_options", None) - h_scale = 1.0 - w_scale = 1.0 - h_start = 0 - w_start = 0 - if rope_options is not None: - h_scale = rope_options.get("scale_y", 1.0) - w_scale = rope_options.get("scale_x", 1.0) - - h_start = rope_options.get("shift_y", 0.0) - w_start = rope_options.get("shift_x", 0.0) - - position_ids[i, :cap_len, 0] = torch.arange(cap_len, dtype=torch.float32, device=device) - position_ids[i, cap_len:cap_len+img_len, 0] = cap_len - row_ids = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten() - col_ids = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten() - position_ids[i, cap_len:cap_len+img_len, 1] = row_ids - position_ids[i, cap_len:cap_len+img_len, 2] = col_ids - - freqs_cis = self.rope_embedder(position_ids).movedim(1, 2).to(dtype) - - # build freqs_cis for cap and image individually - cap_freqs_cis_shape = list(freqs_cis.shape) - # cap_freqs_cis_shape[1] = max_cap_len - cap_freqs_cis_shape[1] = cap_feats.shape[1] - cap_freqs_cis = torch.zeros(*cap_freqs_cis_shape, device=device, dtype=freqs_cis.dtype) - - img_freqs_cis_shape = list(freqs_cis.shape) - img_freqs_cis_shape[1] = max_img_len - img_freqs_cis = torch.zeros(*img_freqs_cis_shape, device=device, dtype=freqs_cis.dtype) - - for i in range(bsz): - cap_len = l_effective_cap_len[i] - img_len = l_effective_img_len[i] - cap_freqs_cis[i, :cap_len] = freqs_cis[i, :cap_len] - img_freqs_cis[i, :img_len] = freqs_cis[i, cap_len:cap_len+img_len] + freqs_cis = self.rope_embedder(torch.cat((cap_pos_ids, x_pos_ids), dim=1)).movedim(1, 2) # refine context for layer in self.context_refiner: - cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis, transformer_options=transformer_options) + cap_feats = layer(cap_feats, cap_mask, freqs_cis[:, :cap_pos_ids.shape[1]], transformer_options=transformer_options) - # refine image - flat_x = [] - for i in range(bsz): - img = x[i] - C, H, W = img.size() - img = img.view(C, H // pH, pH, W // pW, pW).permute(1, 3, 2, 4, 0).flatten(2).flatten(0, 1) - flat_x.append(img) - x = flat_x - padded_img_embed = torch.zeros(bsz, max_img_len, x[0].shape[-1], device=device, dtype=x[0].dtype) - padded_img_mask = torch.zeros(bsz, max_img_len, dtype=dtype, device=device) - for i in range(bsz): - padded_img_embed[i, :l_effective_img_len[i]] = x[i] - padded_img_mask[i, l_effective_img_len[i]:] = -torch.finfo(dtype).max - - padded_img_embed = self.x_embedder(padded_img_embed) - padded_img_mask = padded_img_mask.unsqueeze(1) + padded_img_mask = None for layer in self.noise_refiner: - padded_img_embed = layer(padded_img_embed, padded_img_mask, img_freqs_cis, t, transformer_options=transformer_options) - - if cap_mask is not None: - mask = torch.zeros(bsz, max_seq_len, dtype=dtype, device=device) - mask[:, :max_cap_len] = cap_mask[:, :max_cap_len] - else: - mask = None - - padded_full_embed = torch.zeros(bsz, max_seq_len, self.dim, device=device, dtype=x[0].dtype) - for i in range(bsz): - cap_len = l_effective_cap_len[i] - img_len = l_effective_img_len[i] - - padded_full_embed[i, :cap_len] = cap_feats[i, :cap_len] - padded_full_embed[i, cap_len:cap_len+img_len] = padded_img_embed[i, :img_len] + x = layer(x, padded_img_mask, freqs_cis[:, cap_pos_ids.shape[1]:], t, transformer_options=transformer_options) + padded_full_embed = torch.cat((cap_feats, x), dim=1) + mask = None + img_sizes = [(H, W)] * bsz + l_effective_cap_len = [cap_feats.shape[1]] * bsz return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis def forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs): @@ -627,7 +564,7 @@ class NextDiT(nn.Module): y: (N,) tensor of text tokens/features """ - t = self.t_embedder(t, dtype=x.dtype) # (N, D) + t = self.t_embedder(t * self.time_scale, dtype=x.dtype) # (N, D) adaln_input = t cap_feats = self.cap_embedder(cap_feats) # (N, L, D) # todo check if able to batchify w.o. redundant compute diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py index 42f406f1a..0dc8fe789 100644 --- a/comfy/ldm/modules/diffusionmodules/mmdit.py +++ b/comfy/ldm/modules/diffusionmodules/mmdit.py @@ -211,12 +211,14 @@ class TimestepEmbedder(nn.Module): Embeds scalar timesteps into vector representations. """ - def __init__(self, hidden_size, frequency_embedding_size=256, dtype=None, device=None, operations=None): + def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None): super().__init__() + if output_size is None: + output_size = hidden_size self.mlp = nn.Sequential( operations.Linear(frequency_embedding_size, hidden_size, bias=True, dtype=dtype, device=device), nn.SiLU(), - operations.Linear(hidden_size, hidden_size, bias=True, dtype=dtype, device=device), + operations.Linear(hidden_size, output_size, bias=True, dtype=dtype, device=device), ) self.frequency_embedding_size = frequency_embedding_size diff --git a/comfy/model_base.py b/comfy/model_base.py index cad79ecbd..cc21b1de9 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1114,9 +1114,13 @@ class Lumina2(BaseModel): if torch.numel(attention_mask) != attention_mask.sum(): out['attention_mask'] = comfy.conds.CONDRegular(attention_mask) out['num_tokens'] = comfy.conds.CONDConstant(max(1, torch.sum(attention_mask).item())) + cross_attn = kwargs.get("cross_attn", None) if cross_attn is not None: out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + if 'num_tokens' not in out: + out['num_tokens'] = comfy.conds.CONDConstant(cross_attn.shape[1]) + return out class WAN21(BaseModel): diff --git a/comfy/model_detection.py b/comfy/model_detection.py index b2ba1459d..7afe4a798 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -416,14 +416,31 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["image_model"] = "lumina2" dit_config["patch_size"] = 2 dit_config["in_channels"] = 16 - dit_config["dim"] = 2304 - dit_config["cap_feat_dim"] = state_dict['{}cap_embedder.1.weight'.format(key_prefix)].shape[1] + w = state_dict['{}cap_embedder.1.weight'.format(key_prefix)] + dit_config["dim"] = w.shape[0] + dit_config["cap_feat_dim"] = w.shape[1] dit_config["n_layers"] = count_blocks(state_dict_keys, '{}layers.'.format(key_prefix) + '{}.') - dit_config["n_heads"] = 24 - dit_config["n_kv_heads"] = 8 dit_config["qk_norm"] = True - dit_config["axes_dims"] = [32, 32, 32] - dit_config["axes_lens"] = [300, 512, 512] + + if dit_config["dim"] == 2304: # Original Lumina 2 + dit_config["n_heads"] = 24 + dit_config["n_kv_heads"] = 8 + dit_config["axes_dims"] = [32, 32, 32] + dit_config["axes_lens"] = [300, 512, 512] + dit_config["rope_theta"] = 10000.0 + dit_config["ffn_dim_multiplier"] = 4.0 + elif dit_config["dim"] == 3840: # Z image + dit_config["n_heads"] = 30 + dit_config["n_kv_heads"] = 30 + dit_config["axes_dims"] = [32, 48, 48] + dit_config["axes_lens"] = [1536, 512, 512] + dit_config["rope_theta"] = 256.0 + dit_config["ffn_dim_multiplier"] = (8.0 / 3.0) + dit_config["z_image_modulation"] = True + dit_config["time_scale"] = 1000.0 + if '{}cap_pad_token'.format(key_prefix) in state_dict_keys: + dit_config["pad_tokens_multiple"] = 32 + return dit_config if '{}head.modulation'.format(key_prefix) in state_dict_keys: # Wan 2.1 diff --git a/comfy/sd.py b/comfy/sd.py index 14dd8944c..350fae92b 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -52,6 +52,7 @@ import comfy.text_encoders.ace import comfy.text_encoders.omnigen2 import comfy.text_encoders.qwen_image import comfy.text_encoders.hunyuan_image +import comfy.text_encoders.z_image import comfy.model_patcher import comfy.lora @@ -953,6 +954,8 @@ class TEModel(Enum): GEMMA_3_4B = 13 MISTRAL3_24B = 14 MISTRAL3_24B_PRUNED_FLUX2 = 15 + QWEN3_4B = 16 + def detect_te_model(sd): if "text_model.encoder.layers.30.mlp.fc1.weight" in sd: @@ -985,6 +988,8 @@ def detect_te_model(sd): if weight.shape[0] == 512: return TEModel.QWEN25_7B if "model.layers.0.post_attention_layernorm.weight" in sd: + if 'model.layers.0.self_attn.q_norm.weight' in sd: + return TEModel.QWEN3_4B weight = sd['model.layers.0.post_attention_layernorm.weight'] if weight.shape[0] == 5120: if "model.layers.39.post_attention_layernorm.weight" in sd: @@ -1110,6 +1115,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.clip = comfy.text_encoders.flux.flux2_te(**llama_detect(clip_data), pruned=te_model == TEModel.MISTRAL3_24B_PRUNED_FLUX2) clip_target.tokenizer = comfy.text_encoders.flux.Flux2Tokenizer tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None) + elif te_model == TEModel.QWEN3_4B: + clip_target.clip = comfy.text_encoders.z_image.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.z_image.ZImageTokenizer else: # clip_l if clip_type == CLIPType.SD3: diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index d47ed27bc..cd4b5f76c 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -78,6 +78,28 @@ class Qwen25_3BConfig: rope_scale = None final_norm: bool = True +@dataclass +class Qwen3_4BConfig: + vocab_size: int = 151936 + hidden_size: int = 2560 + intermediate_size: int = 9728 + num_hidden_layers: int = 36 + num_attention_heads: int = 32 + num_key_value_heads: int = 8 + max_position_embeddings: int = 40960 + rms_norm_eps: float = 1e-6 + rope_theta: float = 1000000.0 + transformer_type: str = "llama" + head_dim = 128 + rms_norm_add = False + mlp_activation = "silu" + qkv_bias = False + rope_dims = None + q_norm = "gemma3" + k_norm = "gemma3" + rope_scale = None + final_norm: bool = True + @dataclass class Qwen25_7BVLI_Config: vocab_size: int = 152064 @@ -511,6 +533,15 @@ class Qwen25_3B(BaseLlama, torch.nn.Module): self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) self.dtype = dtype +class Qwen3_4B(BaseLlama, torch.nn.Module): + def __init__(self, config_dict, dtype, device, operations): + super().__init__() + config = Qwen3_4BConfig(**config_dict) + self.num_layers = config.num_hidden_layers + + self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) + self.dtype = dtype + class Qwen25_7BVLI(BaseLlama, torch.nn.Module): def __init__(self, config_dict, dtype, device, operations): super().__init__() diff --git a/comfy/text_encoders/z_image.py b/comfy/text_encoders/z_image.py new file mode 100644 index 000000000..bb9273b20 --- /dev/null +++ b/comfy/text_encoders/z_image.py @@ -0,0 +1,48 @@ +from transformers import Qwen2Tokenizer +import comfy.text_encoders.llama +from comfy import sd1_clip +import os + +class Qwen3Tokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer") + super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2560, embedding_key='qwen3_4b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data) + + +class ZImageTokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_4b", tokenizer=Qwen3Tokenizer) + self.llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + + def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs): + if llama_template is None: + llama_text = self.llama_template.format(text) + else: + llama_text = llama_template.format(text) + + tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs) + return tokens + + +class Qwen3_4BModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}): + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Qwen3_4B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) + + +class ZImageTEModel(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="qwen3_4b", clip_model=Qwen3_4BModel, model_options=model_options) + + +def te(dtype_llama=None, llama_scaled_fp8=None, llama_quantization_metadata=None): + class ZImageTEModel_(ZImageTEModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: + model_options = model_options.copy() + model_options["scaled_fp8"] = llama_scaled_fp8 + if dtype_llama is not None: + dtype = dtype_llama + if llama_quantization_metadata is not None: + model_options["quantization_metadata"] = llama_quantization_metadata + super().__init__(device=device, dtype=dtype, model_options=model_options) + return ZImageTEModel_ From 0e24dbb19f34f242edb77c550396cf6806f7b22f Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 25 Nov 2025 16:02:51 -0800 Subject: [PATCH 623/759] Adjustments to Z Image. (#10893) --- comfy/supported_models.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 8fe8e63f6..af8120400 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -21,6 +21,7 @@ import comfy.text_encoders.ace import comfy.text_encoders.omnigen2 import comfy.text_encoders.qwen_image import comfy.text_encoders.hunyuan_image +import comfy.text_encoders.z_image from . import supported_models_base from . import latent_formats @@ -994,7 +995,7 @@ class Lumina2(supported_models_base.BASE): "shift": 6.0, } - memory_usage_factor = 1.2 + memory_usage_factor = 1.4 unet_extra_config = {} latent_format = latent_formats.Flux @@ -1013,6 +1014,24 @@ class Lumina2(supported_models_base.BASE): hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}gemma2_2b.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.lumina2.LuminaTokenizer, comfy.text_encoders.lumina2.te(**hunyuan_detect)) +class ZImage(Lumina2): + unet_config = { + "image_model": "lumina2", + "dim": 3840, + } + + sampling_settings = { + "multiplier": 1.0, + "shift": 3.0, + } + + memory_usage_factor = 1.7 + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_4b.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.z_image.ZImageTokenizer, comfy.text_encoders.z_image.te(**hunyuan_detect)) + class WAN21_T2V(supported_models_base.BASE): unet_config = { "image_model": "wan2.1", @@ -1453,7 +1472,7 @@ class HunyuanVideo15_SR_Distilled(HunyuanVideo): hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2] models += [SVD_img2vid] From bdb10a583f1b1e495ee00dbd1674f11016a6a93e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 25 Nov 2025 21:07:58 -0800 Subject: [PATCH 624/759] Fix loras not working on mixed fp8. (#10899) --- comfy/model_patcher.py | 2 +- comfy/ops.py | 22 +++++++++++++++++++++- comfy/quant_ops.py | 21 ++++++++++++++------- comfy/weight_adapter/lora.py | 1 + 4 files changed, 37 insertions(+), 9 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 6551ced5a..73adc7f70 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -132,7 +132,7 @@ class LowVramPatch: def __call__(self, weight): intermediate_dtype = weight.dtype if self.convert_func is not None: - weight = self.convert_func(weight.to(dtype=torch.float32, copy=True), inplace=True) + weight = self.convert_func(weight, inplace=False) if intermediate_dtype not in [torch.float32, torch.float16, torch.bfloat16]: #intermediate_dtype has to be one that is supported in math ops intermediate_dtype = torch.float32 diff --git a/comfy/ops.py b/comfy/ops.py index 785aa1c9f..a0ff4e8f1 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -117,6 +117,8 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of if weight_has_function or weight.dtype != dtype: with wf_context: weight = weight.to(dtype=dtype) + if isinstance(weight, QuantizedTensor): + weight = weight.dequantize() for f in s.weight_function: weight = f(weight) @@ -502,7 +504,7 @@ def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None weight *= self.scale_weight.to(device=weight.device, dtype=weight.dtype) return weight else: - return weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype) + return weight.to(dtype=torch.float32) * self.scale_weight.to(device=weight.device, dtype=torch.float32) def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs): weight = comfy.float.stochastic_rounding(weight / self.scale_weight.to(device=weight.device, dtype=weight.dtype), self.weight.dtype, seed=seed) @@ -643,6 +645,24 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful not isinstance(input, QuantizedTensor)): input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, dtype=self.weight.dtype) return self._forward(input, self.weight, self.bias) + + def convert_weight(self, weight, inplace=False, **kwargs): + if isinstance(weight, QuantizedTensor): + return weight.dequantize() + else: + return weight + + def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs): + if getattr(self, 'layout_type', None) is not None: + weight = QuantizedTensor.from_float(weight, self.layout_type, scale=None, dtype=self.weight.dtype, stochastic_rounding=seed, inplace_ops=True) + else: + weight = weight.to(self.weight.dtype) + if return_weight: + return weight + + assert inplace_update is False # TODO: eventually remove the inplace_update stuff + self.weight = torch.nn.Parameter(weight, requires_grad=False) + return MixedPrecisionOps def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None, model_config=None): diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index 0c16bcf8d..d2f3e7397 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -1,6 +1,7 @@ import torch import logging from typing import Tuple, Dict +import comfy.float _LAYOUT_REGISTRY = {} _GENERIC_UTILS = {} @@ -393,7 +394,7 @@ class TensorCoreFP8Layout(QuantizedLayout): - orig_dtype: Original dtype before quantization (for casting back) """ @classmethod - def quantize(cls, tensor, scale=None, dtype=torch.float8_e4m3fn): + def quantize(cls, tensor, scale=None, dtype=torch.float8_e4m3fn, stochastic_rounding=0, inplace_ops=False): orig_dtype = tensor.dtype if scale is None: @@ -403,17 +404,23 @@ class TensorCoreFP8Layout(QuantizedLayout): scale = torch.tensor(scale) scale = scale.to(device=tensor.device, dtype=torch.float32) - tensor_scaled = tensor * (1.0 / scale).to(tensor.dtype) - # TODO: uncomment this if it's actually needed because the clamp has a small performance penality' - lp_amax = torch.finfo(dtype).max - torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled) - qdata = tensor_scaled.to(dtype, memory_format=torch.contiguous_format) + if inplace_ops: + tensor *= (1.0 / scale).to(tensor.dtype) + else: + tensor = tensor * (1.0 / scale).to(tensor.dtype) + + if stochastic_rounding > 0: + tensor = comfy.float.stochastic_rounding(tensor, dtype=dtype, seed=stochastic_rounding) + else: + lp_amax = torch.finfo(dtype).max + torch.clamp(tensor, min=-lp_amax, max=lp_amax, out=tensor) + tensor = tensor.to(dtype, memory_format=torch.contiguous_format) layout_params = { 'scale': scale, 'orig_dtype': orig_dtype } - return qdata, layout_params + return tensor, layout_params @staticmethod def dequantize(qdata, scale, orig_dtype, **kwargs): diff --git a/comfy/weight_adapter/lora.py b/comfy/weight_adapter/lora.py index 4db004e50..3cc60bb1b 100644 --- a/comfy/weight_adapter/lora.py +++ b/comfy/weight_adapter/lora.py @@ -194,6 +194,7 @@ class LoRAAdapter(WeightAdapterBase): lora_diff = torch.mm( mat1.flatten(start_dim=1), mat2.flatten(start_dim=1) ).reshape(weight.shape) + del mat1, mat2 if dora_scale is not None: weight = weight_decompose( dora_scale, From 90b3995ec842335e44d70e0521ff6ff6c3ff9aaa Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 26 Nov 2025 00:34:15 -0500 Subject: [PATCH 625/759] ComfyUI v0.3.74 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index f8818838e..b565c7367 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.73" +__version__ = "0.3.74" diff --git a/pyproject.toml b/pyproject.toml index 7e4bac12d..ccf0fcdb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.73" +version = "0.3.74" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 58b85746618e2bc2dd32024c89403926aad59f48 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 25 Nov 2025 23:36:19 -0800 Subject: [PATCH 626/759] Fix Flux2 reference image mem estimation. (#10905) --- comfy/model_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index cc21b1de9..9b76c285e 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -926,7 +926,7 @@ class Flux(BaseModel): out = {} ref_latents = kwargs.get("reference_latents", None) if ref_latents is not None: - out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16]) + out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()[2:]), ref_latents))]) return out class Flux2(Flux): From 8402c8700a29a97bc5d706d6a0b14c41bc2c2d8a Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 26 Nov 2025 02:41:13 -0500 Subject: [PATCH 627/759] ComfyUI version v0.3.75 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index b565c7367..fa4b4f4b0 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.74" +__version__ = "0.3.75" diff --git a/pyproject.toml b/pyproject.toml index ccf0fcdb9..9009e65fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.74" +version = "0.3.75" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From f16219e3aadcb7a301a1a313ab8989c3ebe53764 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 26 Nov 2025 01:00:43 -0800 Subject: [PATCH 628/759] Add cheap latent preview for flux 2. (#10907) Thank you to the person who calculated them. You saved me a percent of my time. --- comfy/latent_formats.py | 40 ++++++++++++++++++++++++++++++++++++++++ latent_preview.py | 7 +++++-- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index e98c7d6d8..8e110f45d 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -6,6 +6,7 @@ class LatentFormat: latent_dimensions = 2 latent_rgb_factors = None latent_rgb_factors_bias = None + latent_rgb_factors_reshape = None taesd_decoder_name = None def process_in(self, latent): @@ -181,6 +182,45 @@ class Flux(SD3): class Flux2(LatentFormat): latent_channels = 128 + def __init__(self): + self.latent_rgb_factors =[ + [0.0058, 0.0113, 0.0073], + [0.0495, 0.0443, 0.0836], + [-0.0099, 0.0096, 0.0644], + [0.2144, 0.3009, 0.3652], + [0.0166, -0.0039, -0.0054], + [0.0157, 0.0103, -0.0160], + [-0.0398, 0.0902, -0.0235], + [-0.0052, 0.0095, 0.0109], + [-0.3527, -0.2712, -0.1666], + [-0.0301, -0.0356, -0.0180], + [-0.0107, 0.0078, 0.0013], + [0.0746, 0.0090, -0.0941], + [0.0156, 0.0169, 0.0070], + [-0.0034, -0.0040, -0.0114], + [0.0032, 0.0181, 0.0080], + [-0.0939, -0.0008, 0.0186], + [0.0018, 0.0043, 0.0104], + [0.0284, 0.0056, -0.0127], + [-0.0024, -0.0022, -0.0030], + [0.1207, -0.0026, 0.0065], + [0.0128, 0.0101, 0.0142], + [0.0137, -0.0072, -0.0007], + [0.0095, 0.0092, -0.0059], + [0.0000, -0.0077, -0.0049], + [-0.0465, -0.0204, -0.0312], + [0.0095, 0.0012, -0.0066], + [0.0290, -0.0034, 0.0025], + [0.0220, 0.0169, -0.0048], + [-0.0332, -0.0457, -0.0468], + [-0.0085, 0.0389, 0.0609], + [-0.0076, 0.0003, -0.0043], + [-0.0111, -0.0460, -0.0614], + ] + + self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851] + self.latent_rgb_factors_reshape = lambda t: t.reshape(t.shape[0], 32, 2, 2, t.shape[-2], t.shape[-1]).permute(0, 1, 4, 2, 5, 3).reshape(t.shape[0], 32, t.shape[-2] * 2, t.shape[-1] * 2) + def process_in(self, latent): return latent diff --git a/latent_preview.py b/latent_preview.py index 95d3cb733..ddf6dcf49 100644 --- a/latent_preview.py +++ b/latent_preview.py @@ -37,13 +37,16 @@ class TAESDPreviewerImpl(LatentPreviewer): class Latent2RGBPreviewer(LatentPreviewer): - def __init__(self, latent_rgb_factors, latent_rgb_factors_bias=None): + def __init__(self, latent_rgb_factors, latent_rgb_factors_bias=None, latent_rgb_factors_reshape=None): self.latent_rgb_factors = torch.tensor(latent_rgb_factors, device="cpu").transpose(0, 1) self.latent_rgb_factors_bias = None if latent_rgb_factors_bias is not None: self.latent_rgb_factors_bias = torch.tensor(latent_rgb_factors_bias, device="cpu") + self.latent_rgb_factors_reshape = latent_rgb_factors_reshape def decode_latent_to_preview(self, x0): + if self.latent_rgb_factors_reshape is not None: + x0 = self.latent_rgb_factors_reshape(x0) self.latent_rgb_factors = self.latent_rgb_factors.to(dtype=x0.dtype, device=x0.device) if self.latent_rgb_factors_bias is not None: self.latent_rgb_factors_bias = self.latent_rgb_factors_bias.to(dtype=x0.dtype, device=x0.device) @@ -85,7 +88,7 @@ def get_previewer(device, latent_format): if previewer is None: if latent_format.latent_rgb_factors is not None: - previewer = Latent2RGBPreviewer(latent_format.latent_rgb_factors, latent_format.latent_rgb_factors_bias) + previewer = Latent2RGBPreviewer(latent_format.latent_rgb_factors, latent_format.latent_rgb_factors_bias, latent_format.latent_rgb_factors_reshape) return previewer def prepare_callback(model, steps, x0_output_dict=None): From 8938aa3f3064415758fa8f3a628476535a676183 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 26 Nov 2025 19:14:02 +0200 Subject: [PATCH 629/759] add Veo3 First-Last-Frame node (#10878) --- comfy_api_nodes/apis/veo_api.py | 38 +++----- comfy_api_nodes/nodes_veo2.py | 155 ++++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+), 25 deletions(-) diff --git a/comfy_api_nodes/apis/veo_api.py b/comfy_api_nodes/apis/veo_api.py index a55137afb..8328d1aa4 100644 --- a/comfy_api_nodes/apis/veo_api.py +++ b/comfy_api_nodes/apis/veo_api.py @@ -1,34 +1,21 @@ -from typing import Optional, Union -from enum import Enum +from typing import Optional from pydantic import BaseModel, Field -class Image2(BaseModel): - bytesBase64Encoded: str - gcsUri: Optional[str] = None - mimeType: Optional[str] = None +class VeoRequestInstanceImage(BaseModel): + bytesBase64Encoded: str | None = Field(None) + gcsUri: str | None = Field(None) + mimeType: str | None = Field(None) -class Image3(BaseModel): - bytesBase64Encoded: Optional[str] = None - gcsUri: str - mimeType: Optional[str] = None - - -class Instance1(BaseModel): - image: Optional[Union[Image2, Image3]] = Field( - None, description='Optional image to guide video generation' - ) +class VeoRequestInstance(BaseModel): + image: VeoRequestInstanceImage | None = Field(None) + lastFrame: VeoRequestInstanceImage | None = Field(None) prompt: str = Field(..., description='Text description of the video') -class PersonGeneration1(str, Enum): - ALLOW = 'ALLOW' - BLOCK = 'BLOCK' - - -class Parameters1(BaseModel): +class VeoRequestParameters(BaseModel): aspectRatio: Optional[str] = Field(None, examples=['16:9']) durationSeconds: Optional[int] = None enhancePrompt: Optional[bool] = None @@ -37,17 +24,18 @@ class Parameters1(BaseModel): description='Generate audio for the video. Only supported by veo 3 models.', ) negativePrompt: Optional[str] = None - personGeneration: Optional[PersonGeneration1] = None + personGeneration: str | None = Field(None, description="ALLOW or BLOCK") sampleCount: Optional[int] = None seed: Optional[int] = None storageUri: Optional[str] = Field( None, description='Optional Cloud Storage URI to upload the video' ) + resolution: str | None = Field(None) class VeoGenVidRequest(BaseModel): - instances: Optional[list[Instance1]] = None - parameters: Optional[Parameters1] = None + instances: list[VeoRequestInstance] | None = Field(None) + parameters: VeoRequestParameters | None = Field(None) class VeoGenVidResponse(BaseModel): diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index d37e9e9b4..a54dc13ab 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -1,6 +1,7 @@ import base64 from io import BytesIO +import torch from typing_extensions import override from comfy_api.input_impl.video_types import VideoFromFile @@ -10,6 +11,9 @@ from comfy_api_nodes.apis.veo_api import ( VeoGenVidPollResponse, VeoGenVidRequest, VeoGenVidResponse, + VeoRequestInstance, + VeoRequestInstanceImage, + VeoRequestParameters, ) from comfy_api_nodes.util import ( ApiEndpoint, @@ -346,12 +350,163 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): ) +class Veo3FirstLastFrameNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="Veo3FirstLastFrameNode", + display_name="Google Veo 3 First-Last-Frame to Video", + category="api node/video/Veo", + description="Generate video using prompt and first and last frames.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text description of the video", + ), + IO.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid in the video", + ), + IO.Combo.Input("resolution", options=["720p", "1080p"]), + IO.Combo.Input( + "aspect_ratio", + options=["16:9", "9:16"], + default="16:9", + tooltip="Aspect ratio of the output video", + ), + IO.Int.Input( + "duration", + default=8, + min=4, + max=8, + step=2, + display_mode=IO.NumberDisplay.slider, + tooltip="Duration of the output video in seconds", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFF, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed for video generation", + ), + IO.Image.Input("first_frame", tooltip="Start frame"), + IO.Image.Input("last_frame", tooltip="End frame"), + IO.Combo.Input( + "model", + options=["veo-3.1-generate", "veo-3.1-fast-generate"], + default="veo-3.1-fast-generate", + ), + IO.Boolean.Input( + "generate_audio", + default=True, + tooltip="Generate audio for the video.", + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + prompt: str, + negative_prompt: str, + resolution: str, + aspect_ratio: str, + duration: int, + seed: int, + first_frame: torch.Tensor, + last_frame: torch.Tensor, + model: str, + generate_audio: bool, + ): + model = MODELS_MAP[model] + initial_response = await sync_op( + cls, + ApiEndpoint(path=f"/proxy/veo/{model}/generate", method="POST"), + response_model=VeoGenVidResponse, + data=VeoGenVidRequest( + instances=[ + VeoRequestInstance( + prompt=prompt, + image=VeoRequestInstanceImage( + bytesBase64Encoded=tensor_to_base64_string(first_frame), mimeType="image/png" + ), + lastFrame=VeoRequestInstanceImage( + bytesBase64Encoded=tensor_to_base64_string(last_frame), mimeType="image/png" + ), + ), + ], + parameters=VeoRequestParameters( + aspectRatio=aspect_ratio, + personGeneration="ALLOW", + durationSeconds=duration, + enhancePrompt=True, # cannot be False for Veo3 + seed=seed, + generateAudio=generate_audio, + negativePrompt=negative_prompt, + resolution=resolution, + ), + ), + ) + poll_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/veo/{model}/poll", method="POST"), + response_model=VeoGenVidPollResponse, + status_extractor=lambda r: "completed" if r.done else "pending", + data=VeoGenVidPollRequest( + operationName=initial_response.name, + ), + poll_interval=5.0, + estimated_duration=AVERAGE_DURATION_VIDEO_GEN, + ) + + if poll_response.error: + raise Exception(f"Veo API error: {poll_response.error.message} (code: {poll_response.error.code})") + + response = poll_response.response + filtered_count = response.raiMediaFilteredCount + if filtered_count: + reasons = response.raiMediaFilteredReasons or [] + reason_part = f": {reasons[0]}" if reasons else "" + raise Exception( + f"Content blocked by Google's Responsible AI filters{reason_part} " + f"({filtered_count} video{'s' if filtered_count != 1 else ''} filtered)." + ) + + if response.videos: + video = response.videos[0] + if video.bytesBase64Encoded: + return IO.NodeOutput(VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded)))) + if video.gcsUri: + return IO.NodeOutput(await download_url_to_video_output(video.gcsUri)) + raise Exception("Video returned but no data or URL was provided") + raise Exception("Video generation completed but no video was returned") + + class VeoExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ VeoVideoGenerationNode, Veo3VideoGenerationNode, + Veo3FirstLastFrameNode, ] From 1105e0d139001ad602d0f883406bfce41e54ae67 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 26 Nov 2025 19:23:14 +0200 Subject: [PATCH 630/759] improve UX for batch uploads in upload_images_to_comfyapi (#10913) --- comfy_api_nodes/util/upload_helpers.py | 43 +++++++++++++------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py index 632450d9b..b9019841f 100644 --- a/comfy_api_nodes/util/upload_helpers.py +++ b/comfy_api_nodes/util/upload_helpers.py @@ -4,7 +4,7 @@ import logging import time import uuid from io import BytesIO -from typing import Optional, Union +from typing import Optional from urllib.parse import urlparse import aiohttp @@ -48,8 +48,9 @@ async def upload_images_to_comfyapi( image: torch.Tensor, *, max_images: int = 8, - mime_type: Optional[str] = None, - wait_label: Optional[str] = "Uploading", + mime_type: str | None = None, + wait_label: str | None = "Uploading", + show_batch_index: bool = True, ) -> list[str]: """ Uploads images to ComfyUI API and returns download URLs. @@ -59,11 +60,18 @@ async def upload_images_to_comfyapi( download_urls: list[str] = [] is_batch = len(image.shape) > 3 batch_len = image.shape[0] if is_batch else 1 + num_to_upload = min(batch_len, max_images) + batch_start_ts = time.monotonic() - for idx in range(min(batch_len, max_images)): + for idx in range(num_to_upload): tensor = image[idx] if is_batch else image img_io = tensor_to_bytesio(tensor, mime_type=mime_type) - url = await upload_file_to_comfyapi(cls, img_io, img_io.name, mime_type, wait_label) + + effective_label = wait_label + if wait_label and show_batch_index and num_to_upload > 1: + effective_label = f"{wait_label} ({idx + 1}/{num_to_upload})" + + url = await upload_file_to_comfyapi(cls, img_io, img_io.name, mime_type, effective_label, batch_start_ts) download_urls.append(url) return download_urls @@ -126,8 +134,9 @@ async def upload_file_to_comfyapi( cls: type[IO.ComfyNode], file_bytes_io: BytesIO, filename: str, - upload_mime_type: Optional[str], - wait_label: Optional[str] = "Uploading", + upload_mime_type: str | None, + wait_label: str | None = "Uploading", + progress_origin_ts: float | None = None, ) -> str: """Uploads a single file to ComfyUI API and returns its download URL.""" if upload_mime_type is None: @@ -148,6 +157,7 @@ async def upload_file_to_comfyapi( file_bytes_io, content_type=upload_mime_type, wait_label=wait_label, + progress_origin_ts=progress_origin_ts, ) return create_resp.download_url @@ -155,27 +165,18 @@ async def upload_file_to_comfyapi( async def upload_file( cls: type[IO.ComfyNode], upload_url: str, - file: Union[BytesIO, str], + file: BytesIO | str, *, - content_type: Optional[str] = None, + content_type: str | None = None, max_retries: int = 3, retry_delay: float = 1.0, retry_backoff: float = 2.0, - wait_label: Optional[str] = None, + wait_label: str | None = None, + progress_origin_ts: float | None = None, ) -> None: """ Upload a file to a signed URL (e.g., S3 pre-signed PUT) with retries, Comfy progress display, and interruption. - Args: - cls: Node class (provides auth context + UI progress hooks). - upload_url: Pre-signed PUT URL. - file: BytesIO or path string. - content_type: Explicit MIME type. If None, we *suppress* Content-Type. - max_retries: Maximum retry attempts. - retry_delay: Initial delay in seconds. - retry_backoff: Exponential backoff factor. - wait_label: Progress label shown in Comfy UI. - Raises: ProcessingInterrupted, LocalNetworkError, ApiServerError, Exception """ @@ -198,7 +199,7 @@ async def upload_file( attempt = 0 delay = retry_delay - start_ts = time.monotonic() + start_ts = progress_origin_ts if progress_origin_ts is not None else time.monotonic() op_uuid = uuid.uuid4().hex[:8] while True: attempt += 1 From 8908ee262862f1252d1363d55c59872fb3361066 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 26 Nov 2025 20:38:30 +0200 Subject: [PATCH 631/759] fix(gemini): use first 10 images as fileData (URLs) and remaining images as inline base64 (#10918) --- comfy_api_nodes/apis/gemini_api.py | 6 ++++ comfy_api_nodes/nodes_gemini.py | 55 ++++++++++++++++++++---------- 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/comfy_api_nodes/apis/gemini_api.py b/comfy_api_nodes/apis/gemini_api.py index d34590d28..a380ecc86 100644 --- a/comfy_api_nodes/apis/gemini_api.py +++ b/comfy_api_nodes/apis/gemini_api.py @@ -58,8 +58,14 @@ class GeminiInlineData(BaseModel): mimeType: GeminiMimeType | None = Field(None) +class GeminiFileData(BaseModel): + fileUri: str | None = Field(None) + mimeType: GeminiMimeType | None = Field(None) + + class GeminiPart(BaseModel): inlineData: GeminiInlineData | None = Field(None) + fileData: GeminiFileData | None = Field(None) text: str | None = Field(None) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 938a20f84..976d9c225 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -20,6 +20,7 @@ from comfy_api.latest import IO, ComfyExtension, Input from comfy_api.util import VideoCodec, VideoContainer from comfy_api_nodes.apis.gemini_api import ( GeminiContent, + GeminiFileData, GeminiGenerateContentRequest, GeminiGenerateContentResponse, GeminiImageConfig, @@ -38,6 +39,7 @@ from comfy_api_nodes.util import ( get_number_of_images, sync_op, tensor_to_base64_string, + upload_images_to_comfyapi, validate_string, video_to_base64_string, ) @@ -68,24 +70,43 @@ class GeminiImageModel(str, Enum): gemini_2_5_flash_image = "gemini-2.5-flash-image" -def create_image_parts(image_input: torch.Tensor) -> list[GeminiPart]: - """ - Convert image tensor input to Gemini API compatible parts. - - Args: - image_input: Batch of image tensors from ComfyUI. - - Returns: - List of GeminiPart objects containing the encoded images. - """ +async def create_image_parts( + cls: type[IO.ComfyNode], + images: torch.Tensor, + image_limit: int = 0, +) -> list[GeminiPart]: image_parts: list[GeminiPart] = [] - for image_index in range(image_input.shape[0]): - image_as_b64 = tensor_to_base64_string(image_input[image_index].unsqueeze(0)) + if image_limit < 0: + raise ValueError("image_limit must be greater than or equal to 0 when creating Gemini image parts.") + total_images = get_number_of_images(images) + if total_images <= 0: + raise ValueError("No images provided to create_image_parts; at least one image is required.") + + # If image_limit == 0 --> use all images; otherwise clamp to image_limit. + effective_max = total_images if image_limit == 0 else min(total_images, image_limit) + + # Number of images we'll send as URLs (fileData) + num_url_images = min(effective_max, 10) # Vertex API max number of image links + reference_images_urls = await upload_images_to_comfyapi( + cls, + images, + max_images=num_url_images, + ) + for reference_image_url in reference_images_urls: + image_parts.append( + GeminiPart( + fileData=GeminiFileData( + mimeType=GeminiMimeType.image_png, + fileUri=reference_image_url, + ) + ) + ) + for idx in range(num_url_images, effective_max): image_parts.append( GeminiPart( inlineData=GeminiInlineData( mimeType=GeminiMimeType.image_png, - data=image_as_b64, + data=tensor_to_base64_string(images[idx]), ) ) ) @@ -338,8 +359,7 @@ class GeminiNode(IO.ComfyNode): # Add other modal parts if images is not None: - image_parts = create_image_parts(images) - parts.extend(image_parts) + parts.extend(await create_image_parts(cls, images)) if audio is not None: parts.extend(cls.create_audio_parts(audio)) if video is not None: @@ -562,8 +582,7 @@ class GeminiImage(IO.ComfyNode): image_config = GeminiImageConfig(aspectRatio=aspect_ratio) if images is not None: - image_parts = create_image_parts(images) - parts.extend(image_parts) + parts.extend(await create_image_parts(cls, images)) if files is not None: parts.extend(files) @@ -702,7 +721,7 @@ class GeminiImage2(IO.ComfyNode): if images is not None: if get_number_of_images(images) > 14: raise ValueError("The current maximum number of supported images is 14.") - parts.extend(create_image_parts(images)) + parts.extend(await create_image_parts(cls, images)) if files is not None: parts.extend(files) From 234c3dc85f7e61a537bbf6d8999c5880c5e0b746 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Wed, 26 Nov 2025 11:58:08 -0800 Subject: [PATCH 632/759] Bump frontend to 1.32.9 (#10867) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5f20816d6..9291552d3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.30.6 +comfyui-frontend-package==1.32.9 comfyui-workflow-templates==0.7.20 comfyui-embedded-docs==0.3.1 torch From 58c6ed541d5aaf6d9b12f63bc23c33164e1cf7a3 Mon Sep 17 00:00:00 2001 From: Terry Jia Date: Wed, 26 Nov 2025 14:58:27 -0500 Subject: [PATCH 633/759] Merge 3d animation node (#10025) --- comfy_extras/nodes_load_3d.py | 110 +++++++--------------------------- 1 file changed, 23 insertions(+), 87 deletions(-) diff --git a/comfy_extras/nodes_load_3d.py b/comfy_extras/nodes_load_3d.py index 899608149..54c66ef68 100644 --- a/comfy_extras/nodes_load_3d.py +++ b/comfy_extras/nodes_load_3d.py @@ -7,6 +7,10 @@ from comfy_api.input_impl import VideoFromFile from pathlib import Path +from PIL import Image +import numpy as np + +import uuid def normalize_path(path): return path.replace('\\', '/') @@ -34,58 +38,6 @@ class Load3D(): "height": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}), }} - RETURN_TYPES = ("IMAGE", "MASK", "STRING", "IMAGE", "IMAGE", "LOAD3D_CAMERA", IO.VIDEO) - RETURN_NAMES = ("image", "mask", "mesh_path", "normal", "lineart", "camera_info", "recording_video") - - FUNCTION = "process" - EXPERIMENTAL = True - - CATEGORY = "3d" - - def process(self, model_file, image, **kwargs): - image_path = folder_paths.get_annotated_filepath(image['image']) - mask_path = folder_paths.get_annotated_filepath(image['mask']) - normal_path = folder_paths.get_annotated_filepath(image['normal']) - lineart_path = folder_paths.get_annotated_filepath(image['lineart']) - - load_image_node = nodes.LoadImage() - output_image, ignore_mask = load_image_node.load_image(image=image_path) - ignore_image, output_mask = load_image_node.load_image(image=mask_path) - normal_image, ignore_mask2 = load_image_node.load_image(image=normal_path) - lineart_image, ignore_mask3 = load_image_node.load_image(image=lineart_path) - - video = None - - if image['recording'] != "": - recording_video_path = folder_paths.get_annotated_filepath(image['recording']) - - video = VideoFromFile(recording_video_path) - - return output_image, output_mask, model_file, normal_image, lineart_image, image['camera_info'], video - -class Load3DAnimation(): - @classmethod - def INPUT_TYPES(s): - input_dir = os.path.join(folder_paths.get_input_directory(), "3d") - - os.makedirs(input_dir, exist_ok=True) - - input_path = Path(input_dir) - base_path = Path(folder_paths.get_input_directory()) - - files = [ - normalize_path(str(file_path.relative_to(base_path))) - for file_path in input_path.rglob("*") - if file_path.suffix.lower() in {'.gltf', '.glb', '.fbx'} - ] - - return {"required": { - "model_file": (sorted(files), {"file_upload": True}), - "image": ("LOAD_3D_ANIMATION", {}), - "width": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}), - "height": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}), - }} - RETURN_TYPES = ("IMAGE", "MASK", "STRING", "IMAGE", "LOAD3D_CAMERA", IO.VIDEO) RETURN_NAMES = ("image", "mask", "mesh_path", "normal", "camera_info", "recording_video") @@ -120,7 +72,8 @@ class Preview3D(): "model_file": ("STRING", {"default": "", "multiline": False}), }, "optional": { - "camera_info": ("LOAD3D_CAMERA", {}) + "camera_info": ("LOAD3D_CAMERA", {}), + "bg_image": ("IMAGE", {}) }} OUTPUT_NODE = True @@ -133,50 +86,33 @@ class Preview3D(): def process(self, model_file, **kwargs): camera_info = kwargs.get("camera_info", None) + bg_image = kwargs.get("bg_image", None) + + bg_image_path = None + if bg_image is not None: + + img_array = (bg_image[0].cpu().numpy() * 255).astype(np.uint8) + img = Image.fromarray(img_array) + + temp_dir = folder_paths.get_temp_directory() + filename = f"bg_{uuid.uuid4().hex}.png" + bg_image_path = os.path.join(temp_dir, filename) + img.save(bg_image_path, compress_level=1) + + bg_image_path = f"temp/{filename}" return { "ui": { - "result": [model_file, camera_info] - } - } - -class Preview3DAnimation(): - @classmethod - def INPUT_TYPES(s): - return {"required": { - "model_file": ("STRING", {"default": "", "multiline": False}), - }, - "optional": { - "camera_info": ("LOAD3D_CAMERA", {}) - }} - - OUTPUT_NODE = True - RETURN_TYPES = () - - CATEGORY = "3d" - - FUNCTION = "process" - EXPERIMENTAL = True - - def process(self, model_file, **kwargs): - camera_info = kwargs.get("camera_info", None) - - return { - "ui": { - "result": [model_file, camera_info] + "result": [model_file, camera_info, bg_image_path] } } NODE_CLASS_MAPPINGS = { "Load3D": Load3D, - "Load3DAnimation": Load3DAnimation, "Preview3D": Preview3D, - "Preview3DAnimation": Preview3DAnimation } NODE_DISPLAY_NAME_MAPPINGS = { - "Load3D": "Load 3D", - "Load3DAnimation": "Load 3D - Animation", - "Preview3D": "Preview 3D", - "Preview3DAnimation": "Preview 3D - Animation" + "Load3D": "Load 3D & Animation", + "Preview3D": "Preview 3D & Animation", } From 55f654db3ddaf5a10ac6dbe79774c23c350d279d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 26 Nov 2025 12:16:40 -0800 Subject: [PATCH 634/759] Fix the CSP offline feature. (#10923) --- server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server.py b/server.py index 0fd2e49e3..fca5050bd 100644 --- a/server.py +++ b/server.py @@ -174,7 +174,7 @@ def create_block_external_middleware(): else: response = await handler(request) - response.headers['Content-Security-Policy'] = "default-src 'self'; script-src 'self' 'unsafe-inline' blob:; style-src 'self' 'unsafe-inline'; img-src 'self' data: blob:; font-src 'self'; connect-src 'self'; frame-src 'self'; object-src 'self';" + response.headers['Content-Security-Policy'] = "default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval' blob:; style-src 'self' 'unsafe-inline'; img-src 'self' data: blob:; font-src 'self'; connect-src 'self'; frame-src 'self'; object-src 'self';" return response return block_external_middleware From dd41b745497cdbbafb0bd745f590726b0e41f9f3 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 26 Nov 2025 12:36:38 -0800 Subject: [PATCH 635/759] Add Z Image to readme. (#10924) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b9300ab07..91fb510e1 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/) - [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/) - [Flux 2](https://comfyanonymous.github.io/ComfyUI_examples/flux2/) + - [Z Image](https://comfyanonymous.github.io/ComfyUI_examples/z_image/) - Image Editing Models - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/) - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model) From d8433c63fdacef24f40da401b02ebba272bf1fbb Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 27 Nov 2025 00:42:01 +0200 Subject: [PATCH 636/759] chore(api-nodes): remove chat widgets from OpenAI/Gemini nodes (#10861) --- comfy_api_nodes/nodes_gemini.py | 77 +-------------------------------- comfy_api_nodes/nodes_openai.py | 46 ++++---------------- 2 files changed, 11 insertions(+), 112 deletions(-) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 976d9c225..08f7b0f64 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -4,10 +4,7 @@ See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/infer """ import base64 -import json import os -import time -import uuid from enum import Enum from io import BytesIO from typing import Literal @@ -43,7 +40,6 @@ from comfy_api_nodes.util import ( validate_string, video_to_base64_string, ) -from server import PromptServer GEMINI_BASE_ENDPOINT = "/proxy/vertexai/gemini" GEMINI_MAX_INPUT_FILE_SIZE = 20 * 1024 * 1024 # 20 MB @@ -384,29 +380,6 @@ class GeminiNode(IO.ComfyNode): ) output_text = get_text_from_response(response) - if output_text: - # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. - render_spec = { - "node_id": cls.hidden.unique_id, - "component": "ChatHistoryWidget", - "props": { - "history": json.dumps( - [ - { - "prompt": prompt, - "response": output_text, - "response_id": str(uuid.uuid4()), - "timestamp": time.time(), - } - ] - ), - }, - } - PromptServer.instance.send_sync( - "display_component", - render_spec, - ) - return IO.NodeOutput(output_text or "Empty response from Gemini model...") @@ -601,30 +574,7 @@ class GeminiImage(IO.ComfyNode): response_model=GeminiGenerateContentResponse, price_extractor=calculate_tokens_price, ) - - output_text = get_text_from_response(response) - if output_text: - render_spec = { - "node_id": cls.hidden.unique_id, - "component": "ChatHistoryWidget", - "props": { - "history": json.dumps( - [ - { - "prompt": prompt, - "response": output_text, - "response_id": str(uuid.uuid4()), - "timestamp": time.time(), - } - ] - ), - }, - } - PromptServer.instance.send_sync( - "display_component", - render_spec, - ) - return IO.NodeOutput(get_image_from_response(response), output_text) + return IO.NodeOutput(get_image_from_response(response), get_text_from_response(response)) class GeminiImage2(IO.ComfyNode): @@ -744,30 +694,7 @@ class GeminiImage2(IO.ComfyNode): response_model=GeminiGenerateContentResponse, price_extractor=calculate_tokens_price, ) - - output_text = get_text_from_response(response) - if output_text: - render_spec = { - "node_id": cls.hidden.unique_id, - "component": "ChatHistoryWidget", - "props": { - "history": json.dumps( - [ - { - "prompt": prompt, - "response": output_text, - "response_id": str(uuid.uuid4()), - "timestamp": time.time(), - } - ] - ), - }, - } - PromptServer.instance.send_sync( - "display_component", - render_spec, - ) - return IO.NodeOutput(get_image_from_response(response), output_text) + return IO.NodeOutput(get_image_from_response(response), get_text_from_response(response)) class GeminiExtension(ComfyExtension): diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index acf35d276..c8da5464b 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -1,15 +1,10 @@ from io import BytesIO -from typing import Optional, Union -import json import os -import time -import uuid from enum import Enum from inspect import cleandoc import numpy as np import torch from PIL import Image -from server import PromptServer import folder_paths import base64 from comfy_api.latest import IO, ComfyExtension @@ -587,11 +582,11 @@ class OpenAIChatNode(IO.ComfyNode): def create_input_message_contents( cls, prompt: str, - image: Optional[torch.Tensor] = None, - files: Optional[list[InputFileContent]] = None, + image: torch.Tensor | None = None, + files: list[InputFileContent] | None = None, ) -> InputMessageContentList: """Create a list of input message contents from prompt and optional image.""" - content_list: list[Union[InputContent, InputTextContent, InputImageContent, InputFileContent]] = [ + content_list: list[InputContent | InputTextContent | InputImageContent | InputFileContent] = [ InputTextContent(text=prompt, type="input_text"), ] if image is not None: @@ -617,9 +612,9 @@ class OpenAIChatNode(IO.ComfyNode): prompt: str, persist_context: bool = False, model: SupportedOpenAIModel = SupportedOpenAIModel.gpt_5.value, - images: Optional[torch.Tensor] = None, - files: Optional[list[InputFileContent]] = None, - advanced_options: Optional[CreateModelResponseProperties] = None, + images: torch.Tensor | None = None, + files: list[InputFileContent] | None = None, + advanced_options: CreateModelResponseProperties | None = None, ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) @@ -660,30 +655,7 @@ class OpenAIChatNode(IO.ComfyNode): status_extractor=lambda response: response.status, completed_statuses=["incomplete", "completed"] ) - output_text = cls.get_text_from_message_content(cls.get_message_content_from_response(result_response)) - - # Update history - render_spec = { - "node_id": cls.hidden.unique_id, - "component": "ChatHistoryWidget", - "props": { - "history": json.dumps( - [ - { - "prompt": prompt, - "response": output_text, - "response_id": str(uuid.uuid4()), - "timestamp": time.time(), - } - ] - ), - }, - } - PromptServer.instance.send_sync( - "display_component", - render_spec, - ) - return IO.NodeOutput(output_text) + return IO.NodeOutput(cls.get_text_from_message_content(cls.get_message_content_from_response(result_response))) class OpenAIInputFiles(IO.ComfyNode): @@ -790,8 +762,8 @@ class OpenAIChatConfig(IO.ComfyNode): def execute( cls, truncation: bool, - instructions: Optional[str] = None, - max_output_tokens: Optional[int] = None, + instructions: str | None = None, + max_output_tokens: int | None = None, ) -> IO.NodeOutput: """ Configure advanced options for the OpenAI Chat Node. From a2d60aad0f8e03657d501842460123f6eaaf6791 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 27 Nov 2025 00:55:31 +0200 Subject: [PATCH 637/759] convert nodes_customer_sampler.py to V3 schema (#10206) --- comfy_extras/nodes_custom_sampler.py | 1182 ++++++++++++++------------ 1 file changed, 633 insertions(+), 549 deletions(-) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index d011f433b..fbb080886 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -3,272 +3,312 @@ import comfy.samplers import comfy.sample from comfy.k_diffusion import sampling as k_diffusion_sampling from comfy.k_diffusion import sa_solver -from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict import latent_preview import torch import comfy.utils import node_helpers +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class BasicScheduler: +class BasicScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model": ("MODEL",), - "scheduler": (comfy.samplers.SCHEDULER_NAMES, ), - "steps": ("INT", {"default": 20, "min": 1, "max": 10000}), - "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" + def define_schema(cls): + return io.Schema( + node_id="BasicScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Model.Input("model"), + io.Combo.Input("scheduler", options=comfy.samplers.SCHEDULER_NAMES), + io.Int.Input("steps", default=20, min=1, max=10000), + io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01), + ], + outputs=[io.Sigmas.Output()] + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, model, scheduler, steps, denoise): + @classmethod + def execute(cls, model, scheduler, steps, denoise) -> io.NodeOutput: total_steps = steps if denoise < 1.0: if denoise <= 0.0: - return (torch.FloatTensor([]),) + return io.NodeOutput(torch.FloatTensor([])) total_steps = int(steps/denoise) sigmas = comfy.samplers.calculate_sigmas(model.get_model_object("model_sampling"), scheduler, total_steps).cpu() sigmas = sigmas[-(steps + 1):] - return (sigmas, ) + return io.NodeOutput(sigmas) + + get_sigmas = execute -class KarrasScheduler: +class KarrasScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}), - "sigma_max": ("FLOAT", {"default": 14.614642, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}), - "sigma_min": ("FLOAT", {"default": 0.0291675, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}), - "rho": ("FLOAT", {"default": 7.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" + def define_schema(cls): + return io.Schema( + node_id="KarrasScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Int.Input("steps", default=20, min=1, max=10000), + io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False), + io.Float.Input("sigma_min", default=0.0291675, min=0.0, max=5000.0, step=0.01, round=False), + io.Float.Input("rho", default=7.0, min=0.0, max=100.0, step=0.01, round=False), + ], + outputs=[io.Sigmas.Output()] + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, steps, sigma_max, sigma_min, rho): + @classmethod + def execute(cls, steps, sigma_max, sigma_min, rho) -> io.NodeOutput: sigmas = k_diffusion_sampling.get_sigmas_karras(n=steps, sigma_min=sigma_min, sigma_max=sigma_max, rho=rho) - return (sigmas, ) + return io.NodeOutput(sigmas) -class ExponentialScheduler: + get_sigmas = execute + +class ExponentialScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}), - "sigma_max": ("FLOAT", {"default": 14.614642, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}), - "sigma_min": ("FLOAT", {"default": 0.0291675, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" + def define_schema(cls): + return io.Schema( + node_id="ExponentialScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Int.Input("steps", default=20, min=1, max=10000), + io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False), + io.Float.Input("sigma_min", default=0.0291675, min=0.0, max=5000.0, step=0.01, round=False), + ], + outputs=[io.Sigmas.Output()] + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, steps, sigma_max, sigma_min): + @classmethod + def execute(cls, steps, sigma_max, sigma_min) -> io.NodeOutput: sigmas = k_diffusion_sampling.get_sigmas_exponential(n=steps, sigma_min=sigma_min, sigma_max=sigma_max) - return (sigmas, ) + return io.NodeOutput(sigmas) -class PolyexponentialScheduler: + get_sigmas = execute + +class PolyexponentialScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}), - "sigma_max": ("FLOAT", {"default": 14.614642, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}), - "sigma_min": ("FLOAT", {"default": 0.0291675, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}), - "rho": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" + def define_schema(cls): + return io.Schema( + node_id="PolyexponentialScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Int.Input("steps", default=20, min=1, max=10000), + io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False), + io.Float.Input("sigma_min", default=0.0291675, min=0.0, max=5000.0, step=0.01, round=False), + io.Float.Input("rho", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + ], + outputs=[io.Sigmas.Output()] + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, steps, sigma_max, sigma_min, rho): + @classmethod + def execute(cls, steps, sigma_max, sigma_min, rho) -> io.NodeOutput: sigmas = k_diffusion_sampling.get_sigmas_polyexponential(n=steps, sigma_min=sigma_min, sigma_max=sigma_max, rho=rho) - return (sigmas, ) + return io.NodeOutput(sigmas) -class LaplaceScheduler: + get_sigmas = execute + +class LaplaceScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}), - "sigma_max": ("FLOAT", {"default": 14.614642, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}), - "sigma_min": ("FLOAT", {"default": 0.0291675, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}), - "mu": ("FLOAT", {"default": 0.0, "min": -10.0, "max": 10.0, "step":0.1, "round": False}), - "beta": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 10.0, "step":0.1, "round": False}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" + def define_schema(cls): + return io.Schema( + node_id="LaplaceScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Int.Input("steps", default=20, min=1, max=10000), + io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False), + io.Float.Input("sigma_min", default=0.0291675, min=0.0, max=5000.0, step=0.01, round=False), + io.Float.Input("mu", default=0.0, min=-10.0, max=10.0, step=0.1, round=False), + io.Float.Input("beta", default=0.5, min=0.0, max=10.0, step=0.1, round=False), + ], + outputs=[io.Sigmas.Output()] + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, steps, sigma_max, sigma_min, mu, beta): + @classmethod + def execute(cls, steps, sigma_max, sigma_min, mu, beta) -> io.NodeOutput: sigmas = k_diffusion_sampling.get_sigmas_laplace(n=steps, sigma_min=sigma_min, sigma_max=sigma_max, mu=mu, beta=beta) - return (sigmas, ) + return io.NodeOutput(sigmas) + + get_sigmas = execute -class SDTurboScheduler: +class SDTurboScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model": ("MODEL",), - "steps": ("INT", {"default": 1, "min": 1, "max": 10}), - "denoise": ("FLOAT", {"default": 1.0, "min": 0, "max": 1.0, "step": 0.01}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" + def define_schema(cls): + return io.Schema( + node_id="SDTurboScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Model.Input("model"), + io.Int.Input("steps", default=1, min=1, max=10), + io.Float.Input("denoise", default=1.0, min=0, max=1.0, step=0.01), + ], + outputs=[io.Sigmas.Output()] + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, model, steps, denoise): + @classmethod + def execute(cls, model, steps, denoise) -> io.NodeOutput: start_step = 10 - int(10 * denoise) timesteps = torch.flip(torch.arange(1, 11) * 100 - 1, (0,))[start_step:start_step + steps] sigmas = model.get_model_object("model_sampling").sigma(timesteps) sigmas = torch.cat([sigmas, sigmas.new_zeros([1])]) - return (sigmas, ) + return io.NodeOutput(sigmas) -class BetaSamplingScheduler: + get_sigmas = execute + +class BetaSamplingScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model": ("MODEL",), - "steps": ("INT", {"default": 20, "min": 1, "max": 10000}), - "alpha": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 50.0, "step":0.01, "round": False}), - "beta": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 50.0, "step":0.01, "round": False}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" + def define_schema(cls): + return io.Schema( + node_id="BetaSamplingScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Model.Input("model"), + io.Int.Input("steps", default=20, min=1, max=10000), + io.Float.Input("alpha", default=0.6, min=0.0, max=50.0, step=0.01, round=False), + io.Float.Input("beta", default=0.6, min=0.0, max=50.0, step=0.01, round=False), + ], + outputs=[io.Sigmas.Output()] + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, model, steps, alpha, beta): + @classmethod + def execute(cls, model, steps, alpha, beta) -> io.NodeOutput: sigmas = comfy.samplers.beta_scheduler(model.get_model_object("model_sampling"), steps, alpha=alpha, beta=beta) - return (sigmas, ) + return io.NodeOutput(sigmas) -class VPScheduler: + get_sigmas = execute + +class VPScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}), - "beta_d": ("FLOAT", {"default": 19.9, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}), #TODO: fix default values - "beta_min": ("FLOAT", {"default": 0.1, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}), - "eps_s": ("FLOAT", {"default": 0.001, "min": 0.0, "max": 1.0, "step":0.0001, "round": False}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" + def define_schema(cls): + return io.Schema( + node_id="VPScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Int.Input("steps", default=20, min=1, max=10000), + io.Float.Input("beta_d", default=19.9, min=0.0, max=5000.0, step=0.01, round=False), #TODO: fix default values + io.Float.Input("beta_min", default=0.1, min=0.0, max=5000.0, step=0.01, round=False), + io.Float.Input("eps_s", default=0.001, min=0.0, max=1.0, step=0.0001, round=False), + ], + outputs=[io.Sigmas.Output()] + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, steps, beta_d, beta_min, eps_s): + @classmethod + def execute(cls, steps, beta_d, beta_min, eps_s) -> io.NodeOutput: sigmas = k_diffusion_sampling.get_sigmas_vp(n=steps, beta_d=beta_d, beta_min=beta_min, eps_s=eps_s) - return (sigmas, ) + return io.NodeOutput(sigmas) -class SplitSigmas: + get_sigmas = execute + +class SplitSigmas(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"sigmas": ("SIGMAS", ), - "step": ("INT", {"default": 0, "min": 0, "max": 10000}), - } - } - RETURN_TYPES = ("SIGMAS","SIGMAS") - RETURN_NAMES = ("high_sigmas", "low_sigmas") - CATEGORY = "sampling/custom_sampling/sigmas" + def define_schema(cls): + return io.Schema( + node_id="SplitSigmas", + category="sampling/custom_sampling/sigmas", + inputs=[ + io.Sigmas.Input("sigmas"), + io.Int.Input("step", default=0, min=0, max=10000), + ], + outputs=[ + io.Sigmas.Output(display_name="high_sigmas"), + io.Sigmas.Output(display_name="low_sigmas"), + ] + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, sigmas, step): + @classmethod + def execute(cls, sigmas, step) -> io.NodeOutput: sigmas1 = sigmas[:step + 1] sigmas2 = sigmas[step:] - return (sigmas1, sigmas2) + return io.NodeOutput(sigmas1, sigmas2) -class SplitSigmasDenoise: + get_sigmas = execute + +class SplitSigmasDenoise(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"sigmas": ("SIGMAS", ), - "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - } - } - RETURN_TYPES = ("SIGMAS","SIGMAS") - RETURN_NAMES = ("high_sigmas", "low_sigmas") - CATEGORY = "sampling/custom_sampling/sigmas" + def define_schema(cls): + return io.Schema( + node_id="SplitSigmasDenoise", + category="sampling/custom_sampling/sigmas", + inputs=[ + io.Sigmas.Input("sigmas"), + io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01), + ], + outputs=[ + io.Sigmas.Output(display_name="high_sigmas"), + io.Sigmas.Output(display_name="low_sigmas"), + ] + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, sigmas, denoise): + @classmethod + def execute(cls, sigmas, denoise) -> io.NodeOutput: steps = max(sigmas.shape[-1] - 1, 0) total_steps = round(steps * denoise) sigmas1 = sigmas[:-(total_steps)] sigmas2 = sigmas[-(total_steps + 1):] - return (sigmas1, sigmas2) + return io.NodeOutput(sigmas1, sigmas2) -class FlipSigmas: + get_sigmas = execute + +class FlipSigmas(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"sigmas": ("SIGMAS", ), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/sigmas" + def define_schema(cls): + return io.Schema( + node_id="FlipSigmas", + category="sampling/custom_sampling/sigmas", + inputs=[io.Sigmas.Input("sigmas")], + outputs=[io.Sigmas.Output()] + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, sigmas): + @classmethod + def execute(cls, sigmas) -> io.NodeOutput: if len(sigmas) == 0: - return (sigmas,) + return io.NodeOutput(sigmas) sigmas = sigmas.flip(0) if sigmas[0] == 0: sigmas[0] = 0.0001 - return (sigmas,) + return io.NodeOutput(sigmas) -class SetFirstSigma: + get_sigmas = execute + +class SetFirstSigma(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"sigmas": ("SIGMAS", ), - "sigma": ("FLOAT", {"default": 136.0, "min": 0.0, "max": 20000.0, "step": 0.001, "round": False}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/sigmas" + def define_schema(cls): + return io.Schema( + node_id="SetFirstSigma", + category="sampling/custom_sampling/sigmas", + inputs=[ + io.Sigmas.Input("sigmas"), + io.Float.Input("sigma", default=136.0, min=0.0, max=20000.0, step=0.001, round=False), + ], + outputs=[io.Sigmas.Output()] + ) - FUNCTION = "set_first_sigma" - - def set_first_sigma(self, sigmas, sigma): + @classmethod + def execute(cls, sigmas, sigma) -> io.NodeOutput: sigmas = sigmas.clone() sigmas[0] = sigma - return (sigmas, ) + return io.NodeOutput(sigmas) -class ExtendIntermediateSigmas: + set_first_sigma = execute + +class ExtendIntermediateSigmas(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"sigmas": ("SIGMAS", ), - "steps": ("INT", {"default": 2, "min": 1, "max": 100}), - "start_at_sigma": ("FLOAT", {"default": -1.0, "min": -1.0, "max": 20000.0, "step": 0.01, "round": False}), - "end_at_sigma": ("FLOAT", {"default": 12.0, "min": 0.0, "max": 20000.0, "step": 0.01, "round": False}), - "spacing": (['linear', 'cosine', 'sine'],), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/sigmas" + def define_schema(cls): + return io.Schema( + node_id="ExtendIntermediateSigmas", + category="sampling/custom_sampling/sigmas", + inputs=[ + io.Sigmas.Input("sigmas"), + io.Int.Input("steps", default=2, min=1, max=100), + io.Float.Input("start_at_sigma", default=-1.0, min=-1.0, max=20000.0, step=0.01, round=False), + io.Float.Input("end_at_sigma", default=12.0, min=0.0, max=20000.0, step=0.01, round=False), + io.Combo.Input("spacing", options=['linear', 'cosine', 'sine']), + ], + outputs=[io.Sigmas.Output()] + ) - FUNCTION = "extend" - - def extend(self, sigmas: torch.Tensor, steps: int, start_at_sigma: float, end_at_sigma: float, spacing: str): + @classmethod + def execute(cls, sigmas: torch.Tensor, steps: int, start_at_sigma: float, end_at_sigma: float, spacing: str) -> io.NodeOutput: if start_at_sigma < 0: start_at_sigma = float("inf") @@ -299,27 +339,27 @@ class ExtendIntermediateSigmas: extended_sigmas = torch.FloatTensor(extended_sigmas) - return (extended_sigmas,) + return io.NodeOutput(extended_sigmas) + + extend = execute -class SamplingPercentToSigma: +class SamplingPercentToSigma(io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "model": (IO.MODEL, {}), - "sampling_percent": (IO.FLOAT, {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.0001}), - "return_actual_sigma": (IO.BOOLEAN, {"default": False, "tooltip": "Return the actual sigma value instead of the value used for interval checks.\nThis only affects results at 0.0 and 1.0."}), - } - } + def define_schema(cls): + return io.Schema( + node_id="SamplingPercentToSigma", + category="sampling/custom_sampling/sigmas", + inputs=[ + io.Model.Input("model"), + io.Float.Input("sampling_percent", default=0.0, min=0.0, max=1.0, step=0.0001), + io.Boolean.Input("return_actual_sigma", default=False, tooltip="Return the actual sigma value instead of the value used for interval checks.\nThis only affects results at 0.0 and 1.0."), + ], + outputs=[io.Float.Output(display_name="sigma_value")] + ) - RETURN_TYPES = (IO.FLOAT,) - RETURN_NAMES = ("sigma_value",) - CATEGORY = "sampling/custom_sampling/sigmas" - - FUNCTION = "get_sigma" - - def get_sigma(self, model, sampling_percent, return_actual_sigma): + @classmethod + def execute(cls, model, sampling_percent, return_actual_sigma) -> io.NodeOutput: model_sampling = model.get_model_object("model_sampling") sigma_val = model_sampling.percent_to_sigma(sampling_percent) if return_actual_sigma: @@ -327,212 +367,234 @@ class SamplingPercentToSigma: sigma_val = model_sampling.sigma_max.item() elif sampling_percent == 1.0: sigma_val = model_sampling.sigma_min.item() - return (sigma_val,) + return io.NodeOutput(sigma_val) + + get_sigma = execute -class KSamplerSelect: +class KSamplerSelect(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"sampler_name": (comfy.samplers.SAMPLER_NAMES, ), - } - } - RETURN_TYPES = ("SAMPLER",) - CATEGORY = "sampling/custom_sampling/samplers" + def define_schema(cls): + return io.Schema( + node_id="KSamplerSelect", + category="sampling/custom_sampling/samplers", + inputs=[io.Combo.Input("sampler_name", options=comfy.samplers.SAMPLER_NAMES)], + outputs=[io.Sampler.Output()] + ) - FUNCTION = "get_sampler" - - def get_sampler(self, sampler_name): + @classmethod + def execute(cls, sampler_name) -> io.NodeOutput: sampler = comfy.samplers.sampler_object(sampler_name) - return (sampler, ) + return io.NodeOutput(sampler) -class SamplerDPMPP_3M_SDE: + get_sampler = execute + +class SamplerDPMPP_3M_SDE(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "noise_device": (['gpu', 'cpu'], ), - } - } - RETURN_TYPES = ("SAMPLER",) - CATEGORY = "sampling/custom_sampling/samplers" + def define_schema(cls): + return io.Schema( + node_id="SamplerDPMPP_3M_SDE", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + io.Combo.Input("noise_device", options=['gpu', 'cpu']), + ], + outputs=[io.Sampler.Output()] + ) - FUNCTION = "get_sampler" - - def get_sampler(self, eta, s_noise, noise_device): + @classmethod + def execute(cls, eta, s_noise, noise_device) -> io.NodeOutput: if noise_device == 'cpu': sampler_name = "dpmpp_3m_sde" else: sampler_name = "dpmpp_3m_sde_gpu" sampler = comfy.samplers.ksampler(sampler_name, {"eta": eta, "s_noise": s_noise}) - return (sampler, ) + return io.NodeOutput(sampler) -class SamplerDPMPP_2M_SDE: + get_sampler = execute + +class SamplerDPMPP_2M_SDE(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"solver_type": (['midpoint', 'heun'], ), - "eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "noise_device": (['gpu', 'cpu'], ), - } - } - RETURN_TYPES = ("SAMPLER",) - CATEGORY = "sampling/custom_sampling/samplers" + def define_schema(cls): + return io.Schema( + node_id="SamplerDPMPP_2M_SDE", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Combo.Input("solver_type", options=['midpoint', 'heun']), + io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + io.Combo.Input("noise_device", options=['gpu', 'cpu']), + ], + outputs=[io.Sampler.Output()] + ) - FUNCTION = "get_sampler" - - def get_sampler(self, solver_type, eta, s_noise, noise_device): + @classmethod + def execute(cls, solver_type, eta, s_noise, noise_device) -> io.NodeOutput: if noise_device == 'cpu': sampler_name = "dpmpp_2m_sde" else: sampler_name = "dpmpp_2m_sde_gpu" sampler = comfy.samplers.ksampler(sampler_name, {"eta": eta, "s_noise": s_noise, "solver_type": solver_type}) - return (sampler, ) + return io.NodeOutput(sampler) + + get_sampler = execute -class SamplerDPMPP_SDE: +class SamplerDPMPP_SDE(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "r": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "noise_device": (['gpu', 'cpu'], ), - } - } - RETURN_TYPES = ("SAMPLER",) - CATEGORY = "sampling/custom_sampling/samplers" + def define_schema(cls): + return io.Schema( + node_id="SamplerDPMPP_SDE", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("r", default=0.5, min=0.0, max=100.0, step=0.01, round=False), + io.Combo.Input("noise_device", options=['gpu', 'cpu']), + ], + outputs=[io.Sampler.Output()] + ) - FUNCTION = "get_sampler" - - def get_sampler(self, eta, s_noise, r, noise_device): + @classmethod + def execute(cls, eta, s_noise, r, noise_device) -> io.NodeOutput: if noise_device == 'cpu': sampler_name = "dpmpp_sde" else: sampler_name = "dpmpp_sde_gpu" sampler = comfy.samplers.ksampler(sampler_name, {"eta": eta, "s_noise": s_noise, "r": r}) - return (sampler, ) + return io.NodeOutput(sampler) -class SamplerDPMPP_2S_Ancestral: + get_sampler = execute + +class SamplerDPMPP_2S_Ancestral(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - } - } - RETURN_TYPES = ("SAMPLER",) - CATEGORY = "sampling/custom_sampling/samplers" + def define_schema(cls): + return io.Schema( + node_id="SamplerDPMPP_2S_Ancestral", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + ], + outputs=[io.Sampler.Output()] + ) - FUNCTION = "get_sampler" - - def get_sampler(self, eta, s_noise): + @classmethod + def execute(cls, eta, s_noise) -> io.NodeOutput: sampler = comfy.samplers.ksampler("dpmpp_2s_ancestral", {"eta": eta, "s_noise": s_noise}) - return (sampler, ) + return io.NodeOutput(sampler) -class SamplerEulerAncestral: + get_sampler = execute + +class SamplerEulerAncestral(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - } - } - RETURN_TYPES = ("SAMPLER",) - CATEGORY = "sampling/custom_sampling/samplers" + def define_schema(cls): + return io.Schema( + node_id="SamplerEulerAncestral", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + ], + outputs=[io.Sampler.Output()] + ) - FUNCTION = "get_sampler" - - def get_sampler(self, eta, s_noise): + @classmethod + def execute(cls, eta, s_noise) -> io.NodeOutput: sampler = comfy.samplers.ksampler("euler_ancestral", {"eta": eta, "s_noise": s_noise}) - return (sampler, ) + return io.NodeOutput(sampler) -class SamplerEulerAncestralCFGPP: + get_sampler = execute + +class SamplerEulerAncestralCFGPP(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step":0.01, "round": False}), - "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step":0.01, "round": False}), - }} - RETURN_TYPES = ("SAMPLER",) - CATEGORY = "sampling/custom_sampling/samplers" + def define_schema(cls): + return io.Schema( + node_id="SamplerEulerAncestralCFGPP", + display_name="SamplerEulerAncestralCFG++", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Float.Input("eta", default=1.0, min=0.0, max=1.0, step=0.01, round=False), + io.Float.Input("s_noise", default=1.0, min=0.0, max=10.0, step=0.01, round=False), + ], + outputs=[io.Sampler.Output()] + ) - FUNCTION = "get_sampler" - - def get_sampler(self, eta, s_noise): + @classmethod + def execute(cls, eta, s_noise) -> io.NodeOutput: sampler = comfy.samplers.ksampler( "euler_ancestral_cfg_pp", {"eta": eta, "s_noise": s_noise}) - return (sampler, ) + return io.NodeOutput(sampler) -class SamplerLMS: + get_sampler = execute + +class SamplerLMS(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"order": ("INT", {"default": 4, "min": 1, "max": 100}), - } - } - RETURN_TYPES = ("SAMPLER",) - CATEGORY = "sampling/custom_sampling/samplers" + def define_schema(cls): + return io.Schema( + node_id="SamplerLMS", + category="sampling/custom_sampling/samplers", + inputs=[io.Int.Input("order", default=4, min=1, max=100)], + outputs=[io.Sampler.Output()] + ) - FUNCTION = "get_sampler" - - def get_sampler(self, order): + @classmethod + def execute(cls, order) -> io.NodeOutput: sampler = comfy.samplers.ksampler("lms", {"order": order}) - return (sampler, ) + return io.NodeOutput(sampler) -class SamplerDPMAdaptative: + get_sampler = execute + +class SamplerDPMAdaptative(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"order": ("INT", {"default": 3, "min": 2, "max": 3}), - "rtol": ("FLOAT", {"default": 0.05, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "atol": ("FLOAT", {"default": 0.0078, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "h_init": ("FLOAT", {"default": 0.05, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "pcoeff": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "icoeff": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "dcoeff": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "accept_safety": ("FLOAT", {"default": 0.81, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "eta": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}), - } - } - RETURN_TYPES = ("SAMPLER",) - CATEGORY = "sampling/custom_sampling/samplers" + def define_schema(cls): + return io.Schema( + node_id="SamplerDPMAdaptative", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Int.Input("order", default=3, min=2, max=3), + io.Float.Input("rtol", default=0.05, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("atol", default=0.0078, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("h_init", default=0.05, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("pcoeff", default=0.0, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("icoeff", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("dcoeff", default=0.0, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("accept_safety", default=0.81, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("eta", default=0.0, min=0.0, max=100.0, step=0.01, round=False), + io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + ], + outputs=[io.Sampler.Output()] + ) - FUNCTION = "get_sampler" - - def get_sampler(self, order, rtol, atol, h_init, pcoeff, icoeff, dcoeff, accept_safety, eta, s_noise): + @classmethod + def execute(cls, order, rtol, atol, h_init, pcoeff, icoeff, dcoeff, accept_safety, eta, s_noise) -> io.NodeOutput: sampler = comfy.samplers.ksampler("dpm_adaptive", {"order": order, "rtol": rtol, "atol": atol, "h_init": h_init, "pcoeff": pcoeff, "icoeff": icoeff, "dcoeff": dcoeff, "accept_safety": accept_safety, "eta": eta, "s_noise":s_noise }) - return (sampler, ) + return io.NodeOutput(sampler) + + get_sampler = execute -class SamplerER_SDE(ComfyNodeABC): +class SamplerER_SDE(io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "solver_type": (IO.COMBO, {"options": ["ER-SDE", "Reverse-time SDE", "ODE"]}), - "max_stage": (IO.INT, {"default": 3, "min": 1, "max": 3}), - "eta": ( - IO.FLOAT, - {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": False, "tooltip": "Stochastic strength of reverse-time SDE.\nWhen eta=0, it reduces to deterministic ODE. This setting doesn't apply to ER-SDE solver type."}, - ), - "s_noise": (IO.FLOAT, {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": False}), - } - } + def define_schema(cls): + return io.Schema( + node_id="SamplerER_SDE", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Combo.Input("solver_type", options=["ER-SDE", "Reverse-time SDE", "ODE"]), + io.Int.Input("max_stage", default=3, min=1, max=3), + io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, tooltip="Stochastic strength of reverse-time SDE.\nWhen eta=0, it reduces to deterministic ODE. This setting doesn't apply to ER-SDE solver type."), + io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + ], + outputs=[io.Sampler.Output()] + ) - RETURN_TYPES = (IO.SAMPLER,) - CATEGORY = "sampling/custom_sampling/samplers" - - FUNCTION = "get_sampler" - - def get_sampler(self, solver_type, max_stage, eta, s_noise): + @classmethod + def execute(cls, solver_type, max_stage, eta, s_noise) -> io.NodeOutput: if solver_type == "ODE" or (solver_type == "Reverse-time SDE" and eta == 0): eta = 0 s_noise = 0 @@ -548,32 +610,33 @@ class SamplerER_SDE(ComfyNodeABC): sampler_name = "er_sde" sampler = comfy.samplers.ksampler(sampler_name, {"s_noise": s_noise, "noise_scaler": noise_scaler, "max_stage": max_stage}) - return (sampler,) + return io.NodeOutput(sampler) + + get_sampler = execute -class SamplerSASolver(ComfyNodeABC): +class SamplerSASolver(io.ComfyNode): @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "model": (IO.MODEL, {}), - "eta": (IO.FLOAT, {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01, "round": False},), - "sde_start_percent": (IO.FLOAT, {"default": 0.2, "min": 0.0, "max": 1.0, "step": 0.001},), - "sde_end_percent": (IO.FLOAT, {"default": 0.8, "min": 0.0, "max": 1.0, "step": 0.001},), - "s_noise": (IO.FLOAT, {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": False},), - "predictor_order": (IO.INT, {"default": 3, "min": 1, "max": 6}), - "corrector_order": (IO.INT, {"default": 4, "min": 0, "max": 6}), - "use_pece": (IO.BOOLEAN, {}), - "simple_order_2": (IO.BOOLEAN, {}), - } - } + def define_schema(cls): + return io.Schema( + node_id="SamplerSASolver", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Model.Input("model"), + io.Float.Input("eta", default=1.0, min=0.0, max=10.0, step=0.01, round=False), + io.Float.Input("sde_start_percent", default=0.2, min=0.0, max=1.0, step=0.001), + io.Float.Input("sde_end_percent", default=0.8, min=0.0, max=1.0, step=0.001), + io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False), + io.Int.Input("predictor_order", default=3, min=1, max=6), + io.Int.Input("corrector_order", default=4, min=0, max=6), + io.Boolean.Input("use_pece"), + io.Boolean.Input("simple_order_2"), + ], + outputs=[io.Sampler.Output()] + ) - RETURN_TYPES = (IO.SAMPLER,) - CATEGORY = "sampling/custom_sampling/samplers" - - FUNCTION = "get_sampler" - - def get_sampler(self, model, eta, sde_start_percent, sde_end_percent, s_noise, predictor_order, corrector_order, use_pece, simple_order_2): + @classmethod + def execute(cls, model, eta, sde_start_percent, sde_end_percent, s_noise, predictor_order, corrector_order, use_pece, simple_order_2) -> io.NodeOutput: model_sampling = model.get_model_object("model_sampling") start_sigma = model_sampling.percent_to_sigma(sde_start_percent) end_sigma = model_sampling.percent_to_sigma(sde_end_percent) @@ -591,7 +654,9 @@ class SamplerSASolver(ComfyNodeABC): "simple_order_2": simple_order_2, }, ) - return (sampler,) + return io.NodeOutput(sampler) + + get_sampler = execute class Noise_EmptyNoise: @@ -612,30 +677,31 @@ class Noise_RandomNoise: batch_inds = input_latent["batch_index"] if "batch_index" in input_latent else None return comfy.sample.prepare_noise(latent_image, self.seed, batch_inds) -class SamplerCustom: +class SamplerCustom(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model": ("MODEL",), - "add_noise": ("BOOLEAN", {"default": True}), - "noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff, "control_after_generate": True}), - "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), - "positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "sampler": ("SAMPLER", ), - "sigmas": ("SIGMAS", ), - "latent_image": ("LATENT", ), - } - } + def define_schema(cls): + return io.Schema( + node_id="SamplerCustom", + category="sampling/custom_sampling", + inputs=[ + io.Model.Input("model"), + io.Boolean.Input("add_noise", default=True), + io.Int.Input("noise_seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True), + io.Float.Input("cfg", default=8.0, min=0.0, max=100.0, step=0.1, round=0.01), + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Sampler.Input("sampler"), + io.Sigmas.Input("sigmas"), + io.Latent.Input("latent_image"), + ], + outputs=[ + io.Latent.Output(display_name="output"), + io.Latent.Output(display_name="denoised_output"), + ] + ) - RETURN_TYPES = ("LATENT","LATENT") - RETURN_NAMES = ("output", "denoised_output") - - FUNCTION = "sample" - - CATEGORY = "sampling/custom_sampling" - - def sample(self, model, add_noise, noise_seed, cfg, positive, negative, sampler, sigmas, latent_image): + @classmethod + def execute(cls, model, add_noise, noise_seed, cfg, positive, negative, sampler, sigmas, latent_image) -> io.NodeOutput: latent = latent_image latent_image = latent["samples"] latent = latent.copy() @@ -664,52 +730,58 @@ class SamplerCustom: out_denoised["samples"] = model.model.process_latent_out(x0_output["x0"].cpu()) else: out_denoised = out - return (out, out_denoised) + return io.NodeOutput(out, out_denoised) + + sample = execute class Guider_Basic(comfy.samplers.CFGGuider): def set_conds(self, positive): self.inner_set_conds({"positive": positive}) -class BasicGuider: +class BasicGuider(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model": ("MODEL",), - "conditioning": ("CONDITIONING", ), - } - } + def define_schema(cls): + return io.Schema( + node_id="BasicGuider", + category="sampling/custom_sampling/guiders", + inputs=[ + io.Model.Input("model"), + io.Conditioning.Input("conditioning"), + ], + outputs=[io.Guider.Output()] + ) - RETURN_TYPES = ("GUIDER",) - - FUNCTION = "get_guider" - CATEGORY = "sampling/custom_sampling/guiders" - - def get_guider(self, model, conditioning): + @classmethod + def execute(cls, model, conditioning) -> io.NodeOutput: guider = Guider_Basic(model) guider.set_conds(conditioning) - return (guider,) + return io.NodeOutput(guider) -class CFGGuider: + get_guider = execute + +class CFGGuider(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model": ("MODEL",), - "positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), - } - } + def define_schema(cls): + return io.Schema( + node_id="CFGGuider", + category="sampling/custom_sampling/guiders", + inputs=[ + io.Model.Input("model"), + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Float.Input("cfg", default=8.0, min=0.0, max=100.0, step=0.1, round=0.01), + ], + outputs=[io.Guider.Output()] + ) - RETURN_TYPES = ("GUIDER",) - - FUNCTION = "get_guider" - CATEGORY = "sampling/custom_sampling/guiders" - - def get_guider(self, model, positive, negative, cfg): + @classmethod + def execute(cls, model, positive, negative, cfg) -> io.NodeOutput: guider = comfy.samplers.CFGGuider(model) guider.set_conds(positive, negative) guider.set_cfg(cfg) - return (guider,) + return io.NodeOutput(guider) + + get_guider = execute class Guider_DualCFG(comfy.samplers.CFGGuider): def set_cfg(self, cfg1, cfg2, nested=False): @@ -740,84 +812,88 @@ class Guider_DualCFG(comfy.samplers.CFGGuider): out = comfy.samplers.calc_cond_batch(self.inner_model, [negative_cond, middle_cond, positive_cond], x, timestep, model_options) return comfy.samplers.cfg_function(self.inner_model, out[1], out[0], self.cfg2, x, timestep, model_options=model_options, cond=middle_cond, uncond=negative_cond) + (out[2] - out[1]) * self.cfg1 -class DualCFGGuider: +class DualCFGGuider(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model": ("MODEL",), - "cond1": ("CONDITIONING", ), - "cond2": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "cfg_conds": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), - "cfg_cond2_negative": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), - "style": (["regular", "nested"],), - } - } + def define_schema(cls): + return io.Schema( + node_id="DualCFGGuider", + category="sampling/custom_sampling/guiders", + inputs=[ + io.Model.Input("model"), + io.Conditioning.Input("cond1"), + io.Conditioning.Input("cond2"), + io.Conditioning.Input("negative"), + io.Float.Input("cfg_conds", default=8.0, min=0.0, max=100.0, step=0.1, round=0.01), + io.Float.Input("cfg_cond2_negative", default=8.0, min=0.0, max=100.0, step=0.1, round=0.01), + io.Combo.Input("style", options=["regular", "nested"]), + ], + outputs=[io.Guider.Output()] + ) - RETURN_TYPES = ("GUIDER",) - - FUNCTION = "get_guider" - CATEGORY = "sampling/custom_sampling/guiders" - - def get_guider(self, model, cond1, cond2, negative, cfg_conds, cfg_cond2_negative, style): + @classmethod + def execute(cls, model, cond1, cond2, negative, cfg_conds, cfg_cond2_negative, style) -> io.NodeOutput: guider = Guider_DualCFG(model) guider.set_conds(cond1, cond2, negative) guider.set_cfg(cfg_conds, cfg_cond2_negative, nested=(style == "nested")) - return (guider,) + return io.NodeOutput(guider) -class DisableNoise: + get_guider = execute + +class DisableNoise(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required":{ - } - } + def define_schema(cls): + return io.Schema( + node_id="DisableNoise", + category="sampling/custom_sampling/noise", + inputs=[], + outputs=[io.Noise.Output()] + ) - RETURN_TYPES = ("NOISE",) - FUNCTION = "get_noise" - CATEGORY = "sampling/custom_sampling/noise" - - def get_noise(self): - return (Noise_EmptyNoise(),) - - -class RandomNoise(DisableNoise): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "noise_seed": ("INT", { - "default": 0, - "min": 0, - "max": 0xffffffffffffffff, - "control_after_generate": True, - }), - } - } + def execute(cls) -> io.NodeOutput: + return io.NodeOutput(Noise_EmptyNoise()) - def get_noise(self, noise_seed): - return (Noise_RandomNoise(noise_seed),) + get_noise = execute -class SamplerCustomAdvanced: +class RandomNoise(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"noise": ("NOISE", ), - "guider": ("GUIDER", ), - "sampler": ("SAMPLER", ), - "sigmas": ("SIGMAS", ), - "latent_image": ("LATENT", ), - } - } + def define_schema(cls): + return io.Schema( + node_id="RandomNoise", + category="sampling/custom_sampling/noise", + inputs=[io.Int.Input("noise_seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True)], + outputs=[io.Noise.Output()] + ) - RETURN_TYPES = ("LATENT","LATENT") - RETURN_NAMES = ("output", "denoised_output") + @classmethod + def execute(cls, noise_seed) -> io.NodeOutput: + return io.NodeOutput(Noise_RandomNoise(noise_seed)) - FUNCTION = "sample" + get_noise = execute - CATEGORY = "sampling/custom_sampling" - def sample(self, noise, guider, sampler, sigmas, latent_image): +class SamplerCustomAdvanced(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SamplerCustomAdvanced", + category="sampling/custom_sampling", + inputs=[ + io.Noise.Input("noise"), + io.Guider.Input("guider"), + io.Sampler.Input("sampler"), + io.Sigmas.Input("sigmas"), + io.Latent.Input("latent_image"), + ], + outputs=[ + io.Latent.Output(display_name="output"), + io.Latent.Output(display_name="denoised_output"), + ] + ) + + @classmethod + def execute(cls, noise, guider, sampler, sigmas, latent_image) -> io.NodeOutput: latent = latent_image latent_image = latent["samples"] latent = latent.copy() @@ -842,28 +918,32 @@ class SamplerCustomAdvanced: out_denoised["samples"] = guider.model_patcher.model.process_latent_out(x0_output["x0"].cpu()) else: out_denoised = out - return (out, out_denoised) + return io.NodeOutput(out, out_denoised) -class AddNoise: + sample = execute + +class AddNoise(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model": ("MODEL",), - "noise": ("NOISE", ), - "sigmas": ("SIGMAS", ), - "latent_image": ("LATENT", ), - } - } + def define_schema(cls): + return io.Schema( + node_id="AddNoise", + category="_for_testing/custom_sampling/noise", + is_experimental=True, + inputs=[ + io.Model.Input("model"), + io.Noise.Input("noise"), + io.Sigmas.Input("sigmas"), + io.Latent.Input("latent_image"), + ], + outputs=[ + io.Latent.Output(), + ] + ) - RETURN_TYPES = ("LATENT",) - - FUNCTION = "add_noise" - - CATEGORY = "_for_testing/custom_sampling/noise" - - def add_noise(self, model, noise, sigmas, latent_image): + @classmethod + def execute(cls, model, noise, sigmas, latent_image) -> io.NodeOutput: if len(sigmas) == 0: - return latent_image + return io.NodeOutput(latent_image) latent = latent_image latent_image = latent["samples"] @@ -887,46 +967,50 @@ class AddNoise: out = latent.copy() out["samples"] = noisy - return (out,) + return io.NodeOutput(out) + + add_noise = execute -NODE_CLASS_MAPPINGS = { - "SamplerCustom": SamplerCustom, - "BasicScheduler": BasicScheduler, - "KarrasScheduler": KarrasScheduler, - "ExponentialScheduler": ExponentialScheduler, - "PolyexponentialScheduler": PolyexponentialScheduler, - "LaplaceScheduler": LaplaceScheduler, - "VPScheduler": VPScheduler, - "BetaSamplingScheduler": BetaSamplingScheduler, - "SDTurboScheduler": SDTurboScheduler, - "KSamplerSelect": KSamplerSelect, - "SamplerEulerAncestral": SamplerEulerAncestral, - "SamplerEulerAncestralCFGPP": SamplerEulerAncestralCFGPP, - "SamplerLMS": SamplerLMS, - "SamplerDPMPP_3M_SDE": SamplerDPMPP_3M_SDE, - "SamplerDPMPP_2M_SDE": SamplerDPMPP_2M_SDE, - "SamplerDPMPP_SDE": SamplerDPMPP_SDE, - "SamplerDPMPP_2S_Ancestral": SamplerDPMPP_2S_Ancestral, - "SamplerDPMAdaptative": SamplerDPMAdaptative, - "SamplerER_SDE": SamplerER_SDE, - "SamplerSASolver": SamplerSASolver, - "SplitSigmas": SplitSigmas, - "SplitSigmasDenoise": SplitSigmasDenoise, - "FlipSigmas": FlipSigmas, - "SetFirstSigma": SetFirstSigma, - "ExtendIntermediateSigmas": ExtendIntermediateSigmas, - "SamplingPercentToSigma": SamplingPercentToSigma, +class CustomSamplersExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + SamplerCustom, + BasicScheduler, + KarrasScheduler, + ExponentialScheduler, + PolyexponentialScheduler, + LaplaceScheduler, + VPScheduler, + BetaSamplingScheduler, + SDTurboScheduler, + KSamplerSelect, + SamplerEulerAncestral, + SamplerEulerAncestralCFGPP, + SamplerLMS, + SamplerDPMPP_3M_SDE, + SamplerDPMPP_2M_SDE, + SamplerDPMPP_SDE, + SamplerDPMPP_2S_Ancestral, + SamplerDPMAdaptative, + SamplerER_SDE, + SamplerSASolver, + SplitSigmas, + SplitSigmasDenoise, + FlipSigmas, + SetFirstSigma, + ExtendIntermediateSigmas, + SamplingPercentToSigma, + CFGGuider, + DualCFGGuider, + BasicGuider, + RandomNoise, + DisableNoise, + AddNoise, + SamplerCustomAdvanced, + ] - "CFGGuider": CFGGuider, - "DualCFGGuider": DualCFGGuider, - "BasicGuider": BasicGuider, - "RandomNoise": RandomNoise, - "DisableNoise": DisableNoise, - "AddNoise": AddNoise, - "SamplerCustomAdvanced": SamplerCustomAdvanced, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "SamplerEulerAncestralCFGPP": "SamplerEulerAncestralCFG++", -} +async def comfy_entrypoint() -> CustomSamplersExtension: + return CustomSamplersExtension() From cc6a8dcd1ad9cc9ef7602ee141174a0cea0ed4ce Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Thu, 27 Nov 2025 08:18:08 +0800 Subject: [PATCH 638/759] Dataset Processing Nodes and Improved LoRA Trainer Nodes with multi resolution supports. (#10708) * Create nodes_dataset.py * Add encoded dataset caching mechanism * make training node to work with our dataset system * allow trainer node to get different resolution dataset * move all dataset related implementation to nodes_dataset * Rewrite dataset system with new io schema * Rewrite training system with new io schema * add ui pbar * Add outputs' id/name * Fix bad id/naming * use single process instead of input list when no need * fix wrong output_list flag * use torch.load/save and fix bad behaviors --- comfy_extras/nodes_dataset.py | 1532 +++++++++++++++++++++++++++++++++ comfy_extras/nodes_train.py | 967 ++++++++++----------- nodes.py | 1 + 3 files changed, 1980 insertions(+), 520 deletions(-) create mode 100644 comfy_extras/nodes_dataset.py diff --git a/comfy_extras/nodes_dataset.py b/comfy_extras/nodes_dataset.py new file mode 100644 index 000000000..b23867505 --- /dev/null +++ b/comfy_extras/nodes_dataset.py @@ -0,0 +1,1532 @@ +import logging +import os +import math +import json + +import numpy as np +import torch +from PIL import Image +from typing_extensions import override + +import folder_paths +import node_helpers +from comfy_api.latest import ComfyExtension, io + + +def load_and_process_images(image_files, input_dir): + """Utility function to load and process a list of images. + + Args: + image_files: List of image filenames + input_dir: Base directory containing the images + resize_method: How to handle images of different sizes ("None", "Stretch", "Crop", "Pad") + + Returns: + torch.Tensor: Batch of processed images + """ + if not image_files: + raise ValueError("No valid images found in input") + + output_images = [] + + for file in image_files: + image_path = os.path.join(input_dir, file) + img = node_helpers.pillow(Image.open, image_path) + + if img.mode == "I": + img = img.point(lambda i: i * (1 / 255)) + img = img.convert("RGB") + img_array = np.array(img).astype(np.float32) / 255.0 + img_tensor = torch.from_numpy(img_array)[None,] + output_images.append(img_tensor) + + return output_images + + +class LoadImageDataSetFromFolderNode(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="LoadImageDataSetFromFolder", + display_name="Load Image Dataset from Folder", + category="dataset", + is_experimental=True, + inputs=[ + io.Combo.Input( + "folder", + options=folder_paths.get_input_subfolders(), + tooltip="The folder to load images from.", + ) + ], + outputs=[ + io.Image.Output( + display_name="images", + is_output_list=True, + tooltip="List of loaded images", + ) + ], + ) + + @classmethod + def execute(cls, folder): + sub_input_dir = os.path.join(folder_paths.get_input_directory(), folder) + valid_extensions = [".png", ".jpg", ".jpeg", ".webp"] + image_files = [ + f + for f in os.listdir(sub_input_dir) + if any(f.lower().endswith(ext) for ext in valid_extensions) + ] + output_tensor = load_and_process_images(image_files, sub_input_dir) + return io.NodeOutput(output_tensor) + + +class LoadImageTextDataSetFromFolderNode(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="LoadImageTextDataSetFromFolder", + display_name="Load Image and Text Dataset from Folder", + category="dataset", + is_experimental=True, + inputs=[ + io.Combo.Input( + "folder", + options=folder_paths.get_input_subfolders(), + tooltip="The folder to load images from.", + ) + ], + outputs=[ + io.Image.Output( + display_name="images", + is_output_list=True, + tooltip="List of loaded images", + ), + io.String.Output( + display_name="texts", + is_output_list=True, + tooltip="List of text captions", + ), + ], + ) + + @classmethod + def execute(cls, folder): + logging.info(f"Loading images from folder: {folder}") + + sub_input_dir = os.path.join(folder_paths.get_input_directory(), folder) + valid_extensions = [".png", ".jpg", ".jpeg", ".webp"] + + image_files = [] + for item in os.listdir(sub_input_dir): + path = os.path.join(sub_input_dir, item) + if any(item.lower().endswith(ext) for ext in valid_extensions): + image_files.append(path) + elif os.path.isdir(path): + # Support kohya-ss/sd-scripts folder structure + repeat = 1 + if item.split("_")[0].isdigit(): + repeat = int(item.split("_")[0]) + image_files.extend( + [ + os.path.join(path, f) + for f in os.listdir(path) + if any(f.lower().endswith(ext) for ext in valid_extensions) + ] + * repeat + ) + + caption_file_path = [ + f.replace(os.path.splitext(f)[1], ".txt") for f in image_files + ] + captions = [] + for caption_file in caption_file_path: + caption_path = os.path.join(sub_input_dir, caption_file) + if os.path.exists(caption_path): + with open(caption_path, "r", encoding="utf-8") as f: + caption = f.read().strip() + captions.append(caption) + else: + captions.append("") + + output_tensor = load_and_process_images(image_files, sub_input_dir) + + logging.info(f"Loaded {len(output_tensor)} images from {sub_input_dir}.") + return io.NodeOutput(output_tensor, captions) + + +def save_images_to_folder(image_list, output_dir, prefix="image"): + """Utility function to save a list of image tensors to disk. + + Args: + image_list: List of image tensors (each [1, H, W, C] or [H, W, C] or [C, H, W]) + output_dir: Directory to save images to + prefix: Filename prefix + + Returns: + List of saved filenames + """ + os.makedirs(output_dir, exist_ok=True) + saved_files = [] + + for idx, img_tensor in enumerate(image_list): + # Handle different tensor shapes + if isinstance(img_tensor, torch.Tensor): + # Remove batch dimension if present [1, H, W, C] -> [H, W, C] + if img_tensor.dim() == 4 and img_tensor.shape[0] == 1: + img_tensor = img_tensor.squeeze(0) + + # If tensor is [C, H, W], permute to [H, W, C] + if img_tensor.dim() == 3 and img_tensor.shape[0] in [1, 3, 4]: + if ( + img_tensor.shape[0] <= 4 + and img_tensor.shape[1] > 4 + and img_tensor.shape[2] > 4 + ): + img_tensor = img_tensor.permute(1, 2, 0) + + # Convert to numpy and scale to 0-255 + img_array = img_tensor.cpu().numpy() + img_array = np.clip(img_array * 255.0, 0, 255).astype(np.uint8) + + # Convert to PIL Image + img = Image.fromarray(img_array) + else: + raise ValueError(f"Expected torch.Tensor, got {type(img_tensor)}") + + # Save image + filename = f"{prefix}_{idx:05d}.png" + filepath = os.path.join(output_dir, filename) + img.save(filepath) + saved_files.append(filename) + + return saved_files + + +class SaveImageDataSetToFolderNode(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SaveImageDataSetToFolder", + display_name="Save Image Dataset to Folder", + category="dataset", + is_experimental=True, + is_output_node=True, + is_input_list=True, # Receive images as list + inputs=[ + io.Image.Input("images", tooltip="List of images to save."), + io.String.Input( + "folder_name", + default="dataset", + tooltip="Name of the folder to save images to (inside output directory).", + ), + io.String.Input( + "filename_prefix", + default="image", + tooltip="Prefix for saved image filenames.", + ), + ], + outputs=[], + ) + + @classmethod + def execute(cls, images, folder_name, filename_prefix): + # Extract scalar values + folder_name = folder_name[0] + filename_prefix = filename_prefix[0] + + output_dir = os.path.join(folder_paths.get_output_directory(), folder_name) + saved_files = save_images_to_folder(images, output_dir, filename_prefix) + + logging.info(f"Saved {len(saved_files)} images to {output_dir}.") + return io.NodeOutput() + + +class SaveImageTextDataSetToFolderNode(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SaveImageTextDataSetToFolder", + display_name="Save Image and Text Dataset to Folder", + category="dataset", + is_experimental=True, + is_output_node=True, + is_input_list=True, # Receive both images and texts as lists + inputs=[ + io.Image.Input("images", tooltip="List of images to save."), + io.String.Input("texts", tooltip="List of text captions to save."), + io.String.Input( + "folder_name", + default="dataset", + tooltip="Name of the folder to save images to (inside output directory).", + ), + io.String.Input( + "filename_prefix", + default="image", + tooltip="Prefix for saved image filenames.", + ), + ], + outputs=[], + ) + + @classmethod + def execute(cls, images, texts, folder_name, filename_prefix): + # Extract scalar values + folder_name = folder_name[0] + filename_prefix = filename_prefix[0] + + output_dir = os.path.join(folder_paths.get_output_directory(), folder_name) + saved_files = save_images_to_folder(images, output_dir, filename_prefix) + + # Save captions + for idx, (filename, caption) in enumerate(zip(saved_files, texts)): + caption_filename = filename.replace(".png", ".txt") + caption_path = os.path.join(output_dir, caption_filename) + with open(caption_path, "w", encoding="utf-8") as f: + f.write(caption) + + logging.info(f"Saved {len(saved_files)} images and captions to {output_dir}.") + return io.NodeOutput() + + +# ========== Helper Functions for Transform Nodes ========== + + +def tensor_to_pil(img_tensor): + """Convert tensor to PIL Image.""" + if img_tensor.dim() == 4 and img_tensor.shape[0] == 1: + img_tensor = img_tensor.squeeze(0) + img_array = (img_tensor.cpu().numpy() * 255).clip(0, 255).astype(np.uint8) + return Image.fromarray(img_array) + + +def pil_to_tensor(img): + """Convert PIL Image to tensor.""" + img_array = np.array(img).astype(np.float32) / 255.0 + return torch.from_numpy(img_array)[None,] + + +# ========== Base Classes for Transform Nodes ========== + + +class ImageProcessingNode(io.ComfyNode): + """Base class for image processing nodes that operate on images. + + Child classes should set: + node_id: Unique node identifier (required) + display_name: Display name (optional, defaults to node_id) + description: Node description (optional) + extra_inputs: List of additional io.Input objects beyond "images" (optional) + is_group_process: None (auto-detect), True (group), or False (individual) (optional) + is_output_list: True (list output) or False (single output) (optional, default True) + + Child classes must implement ONE of: + _process(cls, image, **kwargs) -> tensor (for single-item processing) + _group_process(cls, images, **kwargs) -> list[tensor] (for group processing) + """ + + node_id = None + display_name = None + description = None + extra_inputs = [] + is_group_process = None # None = auto-detect, True/False = explicit + is_output_list = None # None = auto-detect based on processing mode + + @classmethod + def _detect_processing_mode(cls): + """Detect whether this node uses group or individual processing. + + Returns: + bool: True if group processing, False if individual processing + """ + # Explicit setting takes precedence + if cls.is_group_process is not None: + return cls.is_group_process + + # Check which method is overridden by looking at the defining class in MRO + base_class = ImageProcessingNode + + # Find which class in MRO defines _process + process_definer = None + for klass in cls.__mro__: + if "_process" in klass.__dict__: + process_definer = klass + break + + # Find which class in MRO defines _group_process + group_definer = None + for klass in cls.__mro__: + if "_group_process" in klass.__dict__: + group_definer = klass + break + + # Check what was overridden (not defined in base class) + has_process = process_definer is not None and process_definer is not base_class + has_group = group_definer is not None and group_definer is not base_class + + if has_process and has_group: + raise ValueError( + f"{cls.__name__}: Cannot override both _process and _group_process. " + "Override only one, or set is_group_process explicitly." + ) + if not has_process and not has_group: + raise ValueError( + f"{cls.__name__}: Must override either _process or _group_process" + ) + + return has_group + + @classmethod + def define_schema(cls): + if cls.node_id is None: + raise NotImplementedError(f"{cls.__name__} must set node_id class variable") + + is_group = cls._detect_processing_mode() + + # Auto-detect is_output_list if not explicitly set + # Single processing: False (backend collects results into list) + # Group processing: True by default (can be False for single-output nodes) + output_is_list = ( + cls.is_output_list if cls.is_output_list is not None else is_group + ) + + inputs = [ + io.Image.Input( + "images", + tooltip=( + "List of images to process." if is_group else "Image to process." + ), + ) + ] + inputs.extend(cls.extra_inputs) + + return io.Schema( + node_id=cls.node_id, + display_name=cls.display_name or cls.node_id, + category="dataset/image", + is_experimental=True, + is_input_list=is_group, # True for group, False for individual + inputs=inputs, + outputs=[ + io.Image.Output( + display_name="images", + is_output_list=output_is_list, + tooltip="Processed images", + ) + ], + ) + + @classmethod + def execute(cls, images, **kwargs): + """Execute the node. Routes to _process or _group_process based on mode.""" + is_group = cls._detect_processing_mode() + + # Extract scalar values from lists for parameters + params = {} + for k, v in kwargs.items(): + if isinstance(v, list) and len(v) == 1: + params[k] = v[0] + else: + params[k] = v + + if is_group: + # Group processing: images is list, call _group_process + result = cls._group_process(images, **params) + else: + # Individual processing: images is single item, call _process + result = cls._process(images, **params) + + return io.NodeOutput(result) + + @classmethod + def _process(cls, image, **kwargs): + """Override this method for single-item processing. + + Args: + image: tensor - Single image tensor + **kwargs: Additional parameters (already extracted from lists) + + Returns: + tensor - Processed image + """ + raise NotImplementedError(f"{cls.__name__} must implement _process method") + + @classmethod + def _group_process(cls, images, **kwargs): + """Override this method for group processing. + + Args: + images: list[tensor] - List of image tensors + **kwargs: Additional parameters (already extracted from lists) + + Returns: + list[tensor] - Processed images + """ + raise NotImplementedError( + f"{cls.__name__} must implement _group_process method" + ) + + +class TextProcessingNode(io.ComfyNode): + """Base class for text processing nodes that operate on texts. + + Child classes should set: + node_id: Unique node identifier (required) + display_name: Display name (optional, defaults to node_id) + description: Node description (optional) + extra_inputs: List of additional io.Input objects beyond "texts" (optional) + is_group_process: None (auto-detect), True (group), or False (individual) (optional) + is_output_list: True (list output) or False (single output) (optional, default True) + + Child classes must implement ONE of: + _process(cls, text, **kwargs) -> str (for single-item processing) + _group_process(cls, texts, **kwargs) -> list[str] (for group processing) + """ + + node_id = None + display_name = None + description = None + extra_inputs = [] + is_group_process = None # None = auto-detect, True/False = explicit + is_output_list = None # None = auto-detect based on processing mode + + @classmethod + def _detect_processing_mode(cls): + """Detect whether this node uses group or individual processing. + + Returns: + bool: True if group processing, False if individual processing + """ + # Explicit setting takes precedence + if cls.is_group_process is not None: + return cls.is_group_process + + # Check which method is overridden by looking at the defining class in MRO + base_class = TextProcessingNode + + # Find which class in MRO defines _process + process_definer = None + for klass in cls.__mro__: + if "_process" in klass.__dict__: + process_definer = klass + break + + # Find which class in MRO defines _group_process + group_definer = None + for klass in cls.__mro__: + if "_group_process" in klass.__dict__: + group_definer = klass + break + + # Check what was overridden (not defined in base class) + has_process = process_definer is not None and process_definer is not base_class + has_group = group_definer is not None and group_definer is not base_class + + if has_process and has_group: + raise ValueError( + f"{cls.__name__}: Cannot override both _process and _group_process. " + "Override only one, or set is_group_process explicitly." + ) + if not has_process and not has_group: + raise ValueError( + f"{cls.__name__}: Must override either _process or _group_process" + ) + + return has_group + + @classmethod + def define_schema(cls): + if cls.node_id is None: + raise NotImplementedError(f"{cls.__name__} must set node_id class variable") + + is_group = cls._detect_processing_mode() + + inputs = [ + io.String.Input( + "texts", + tooltip="List of texts to process." if is_group else "Text to process.", + ) + ] + inputs.extend(cls.extra_inputs) + + return io.Schema( + node_id=cls.node_id, + display_name=cls.display_name or cls.node_id, + category="dataset/text", + is_experimental=True, + is_input_list=is_group, # True for group, False for individual + inputs=inputs, + outputs=[ + io.String.Output( + display_name="texts", + is_output_list=cls.is_output_list, + tooltip="Processed texts", + ) + ], + ) + + @classmethod + def execute(cls, texts, **kwargs): + """Execute the node. Routes to _process or _group_process based on mode.""" + is_group = cls._detect_processing_mode() + + # Extract scalar values from lists for parameters + params = {} + for k, v in kwargs.items(): + if isinstance(v, list) and len(v) == 1: + params[k] = v[0] + else: + params[k] = v + + if is_group: + # Group processing: texts is list, call _group_process + result = cls._group_process(texts, **params) + else: + # Individual processing: texts is single item, call _process + result = cls._process(texts, **params) + + # Wrap result based on is_output_list + if cls.is_output_list: + # Result should already be a list (or will be for individual) + return io.NodeOutput(result if is_group else [result]) + else: + # Single output - wrap in list for NodeOutput + return io.NodeOutput([result]) + + @classmethod + def _process(cls, text, **kwargs): + """Override this method for single-item processing. + + Args: + text: str - Single text string + **kwargs: Additional parameters (already extracted from lists) + + Returns: + str - Processed text + """ + raise NotImplementedError(f"{cls.__name__} must implement _process method") + + @classmethod + def _group_process(cls, texts, **kwargs): + """Override this method for group processing. + + Args: + texts: list[str] - List of text strings + **kwargs: Additional parameters (already extracted from lists) + + Returns: + list[str] - Processed texts + """ + raise NotImplementedError( + f"{cls.__name__} must implement _group_process method" + ) + + +# ========== Image Transform Nodes ========== + + +class ResizeImagesToSameSizeNode(ImageProcessingNode): + node_id = "ResizeImagesToSameSize" + display_name = "Resize Images to Same Size" + description = "Resize all images to the same width and height." + extra_inputs = [ + io.Int.Input("width", default=512, min=1, max=8192, tooltip="Target width."), + io.Int.Input("height", default=512, min=1, max=8192, tooltip="Target height."), + io.Combo.Input( + "mode", + options=["stretch", "crop_center", "pad"], + default="stretch", + tooltip="Resize mode.", + ), + ] + + @classmethod + def _process(cls, image, width, height, mode): + img = tensor_to_pil(image) + + if mode == "stretch": + img = img.resize((width, height), Image.Resampling.LANCZOS) + elif mode == "crop_center": + left = max(0, (img.width - width) // 2) + top = max(0, (img.height - height) // 2) + right = min(img.width, left + width) + bottom = min(img.height, top + height) + img = img.crop((left, top, right, bottom)) + if img.width != width or img.height != height: + img = img.resize((width, height), Image.Resampling.LANCZOS) + elif mode == "pad": + img.thumbnail((width, height), Image.Resampling.LANCZOS) + new_img = Image.new("RGB", (width, height), (0, 0, 0)) + paste_x = (width - img.width) // 2 + paste_y = (height - img.height) // 2 + new_img.paste(img, (paste_x, paste_y)) + img = new_img + + return pil_to_tensor(img) + + +class ResizeImagesToPixelCountNode(ImageProcessingNode): + node_id = "ResizeImagesToPixelCount" + display_name = "Resize Images to Pixel Count" + description = "Resize images so that the total pixel count matches the specified number while preserving aspect ratio." + extra_inputs = [ + io.Int.Input( + "pixel_count", + default=512 * 512, + min=1, + max=8192 * 8192, + tooltip="Target pixel count.", + ), + io.Int.Input( + "steps", + default=64, + min=1, + max=128, + tooltip="The stepping for resize width/height.", + ), + ] + + @classmethod + def _process(cls, image, pixel_count, steps): + img = tensor_to_pil(image) + w, h = img.size + pixel_count_ratio = math.sqrt(pixel_count / (w * h)) + new_w = int(w * pixel_count_ratio / steps) * steps + new_h = int(h * pixel_count_ratio / steps) * steps + logging.info(f"Resizing from {w}x{h} to {new_w}x{new_h}") + img = img.resize((new_w, new_h), Image.Resampling.LANCZOS) + return pil_to_tensor(img) + + +class ResizeImagesByShorterEdgeNode(ImageProcessingNode): + node_id = "ResizeImagesByShorterEdge" + display_name = "Resize Images by Shorter Edge" + description = "Resize images so that the shorter edge matches the specified length while preserving aspect ratio." + extra_inputs = [ + io.Int.Input( + "shorter_edge", + default=512, + min=1, + max=8192, + tooltip="Target length for the shorter edge.", + ), + ] + + @classmethod + def _process(cls, image, shorter_edge): + img = tensor_to_pil(image) + w, h = img.size + if w < h: + new_w = shorter_edge + new_h = int(h * (shorter_edge / w)) + else: + new_h = shorter_edge + new_w = int(w * (shorter_edge / h)) + img = img.resize((new_w, new_h), Image.Resampling.LANCZOS) + return pil_to_tensor(img) + + +class ResizeImagesByLongerEdgeNode(ImageProcessingNode): + node_id = "ResizeImagesByLongerEdge" + display_name = "Resize Images by Longer Edge" + description = "Resize images so that the longer edge matches the specified length while preserving aspect ratio." + extra_inputs = [ + io.Int.Input( + "longer_edge", + default=1024, + min=1, + max=8192, + tooltip="Target length for the longer edge.", + ), + ] + + @classmethod + def _process(cls, image, longer_edge): + img = tensor_to_pil(image) + w, h = img.size + if w > h: + new_w = longer_edge + new_h = int(h * (longer_edge / w)) + else: + new_h = longer_edge + new_w = int(w * (longer_edge / h)) + img = img.resize((new_w, new_h), Image.Resampling.LANCZOS) + return pil_to_tensor(img) + + +class CenterCropImagesNode(ImageProcessingNode): + node_id = "CenterCropImages" + display_name = "Center Crop Images" + description = "Center crop all images to the specified dimensions." + extra_inputs = [ + io.Int.Input("width", default=512, min=1, max=8192, tooltip="Crop width."), + io.Int.Input("height", default=512, min=1, max=8192, tooltip="Crop height."), + ] + + @classmethod + def _process(cls, image, width, height): + img = tensor_to_pil(image) + left = max(0, (img.width - width) // 2) + top = max(0, (img.height - height) // 2) + right = min(img.width, left + width) + bottom = min(img.height, top + height) + img = img.crop((left, top, right, bottom)) + return pil_to_tensor(img) + + +class RandomCropImagesNode(ImageProcessingNode): + node_id = "RandomCropImages" + display_name = "Random Crop Images" + description = ( + "Randomly crop all images to the specified dimensions (for data augmentation)." + ) + extra_inputs = [ + io.Int.Input("width", default=512, min=1, max=8192, tooltip="Crop width."), + io.Int.Input("height", default=512, min=1, max=8192, tooltip="Crop height."), + io.Int.Input( + "seed", default=0, min=0, max=0xFFFFFFFFFFFFFFFF, tooltip="Random seed." + ), + ] + + @classmethod + def _process(cls, image, width, height, seed): + np.random.seed(seed % (2**32 - 1)) + img = tensor_to_pil(image) + max_left = max(0, img.width - width) + max_top = max(0, img.height - height) + left = np.random.randint(0, max_left + 1) if max_left > 0 else 0 + top = np.random.randint(0, max_top + 1) if max_top > 0 else 0 + right = min(img.width, left + width) + bottom = min(img.height, top + height) + img = img.crop((left, top, right, bottom)) + return pil_to_tensor(img) + + +class FlipImagesNode(ImageProcessingNode): + node_id = "FlipImages" + display_name = "Flip Images" + description = "Flip all images horizontally or vertically." + extra_inputs = [ + io.Combo.Input( + "direction", + options=["horizontal", "vertical"], + default="horizontal", + tooltip="Flip direction.", + ), + ] + + @classmethod + def _process(cls, image, direction): + img = tensor_to_pil(image) + if direction == "horizontal": + img = img.transpose(Image.FLIP_LEFT_RIGHT) + else: + img = img.transpose(Image.FLIP_TOP_BOTTOM) + return pil_to_tensor(img) + + +class NormalizeImagesNode(ImageProcessingNode): + node_id = "NormalizeImages" + display_name = "Normalize Images" + description = "Normalize images using mean and standard deviation." + extra_inputs = [ + io.Float.Input( + "mean", + default=0.5, + min=0.0, + max=1.0, + tooltip="Mean value for normalization.", + ), + io.Float.Input( + "std", + default=0.5, + min=0.001, + max=1.0, + tooltip="Standard deviation for normalization.", + ), + ] + + @classmethod + def _process(cls, image, mean, std): + return (image - mean) / std + + +class AdjustBrightnessNode(ImageProcessingNode): + node_id = "AdjustBrightness" + display_name = "Adjust Brightness" + description = "Adjust brightness of all images." + extra_inputs = [ + io.Float.Input( + "factor", + default=1.0, + min=0.0, + max=2.0, + tooltip="Brightness factor. 1.0 = no change, <1.0 = darker, >1.0 = brighter.", + ), + ] + + @classmethod + def _process(cls, image, factor): + return (image * factor).clamp(0.0, 1.0) + + +class AdjustContrastNode(ImageProcessingNode): + node_id = "AdjustContrast" + display_name = "Adjust Contrast" + description = "Adjust contrast of all images." + extra_inputs = [ + io.Float.Input( + "factor", + default=1.0, + min=0.0, + max=2.0, + tooltip="Contrast factor. 1.0 = no change, <1.0 = less contrast, >1.0 = more contrast.", + ), + ] + + @classmethod + def _process(cls, image, factor): + return ((image - 0.5) * factor + 0.5).clamp(0.0, 1.0) + + +class ShuffleDatasetNode(ImageProcessingNode): + node_id = "ShuffleDataset" + display_name = "Shuffle Image Dataset" + description = "Randomly shuffle the order of images in the dataset." + is_group_process = True # Requires full list to shuffle + extra_inputs = [ + io.Int.Input( + "seed", default=0, min=0, max=0xFFFFFFFFFFFFFFFF, tooltip="Random seed." + ), + ] + + @classmethod + def _group_process(cls, images, seed): + np.random.seed(seed % (2**32 - 1)) + indices = np.random.permutation(len(images)) + return [images[i] for i in indices] + + +class ShuffleImageTextDatasetNode(io.ComfyNode): + """Special node that shuffles both images and texts together.""" + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="ShuffleImageTextDataset", + display_name="Shuffle Image-Text Dataset", + category="dataset/image", + is_experimental=True, + is_input_list=True, + inputs=[ + io.Image.Input("images", tooltip="List of images to shuffle."), + io.String.Input("texts", tooltip="List of texts to shuffle."), + io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + tooltip="Random seed.", + ), + ], + outputs=[ + io.Image.Output( + display_name="images", + is_output_list=True, + tooltip="Shuffled images", + ), + io.String.Output( + display_name="texts", is_output_list=True, tooltip="Shuffled texts" + ), + ], + ) + + @classmethod + def execute(cls, images, texts, seed): + seed = seed[0] # Extract scalar + np.random.seed(seed % (2**32 - 1)) + indices = np.random.permutation(len(images)) + shuffled_images = [images[i] for i in indices] + shuffled_texts = [texts[i] for i in indices] + return io.NodeOutput(shuffled_images, shuffled_texts) + + +# ========== Text Transform Nodes ========== + + +class TextToLowercaseNode(TextProcessingNode): + node_id = "TextToLowercase" + display_name = "Text to Lowercase" + description = "Convert all texts to lowercase." + + @classmethod + def _process(cls, text): + return text.lower() + + +class TextToUppercaseNode(TextProcessingNode): + node_id = "TextToUppercase" + display_name = "Text to Uppercase" + description = "Convert all texts to uppercase." + + @classmethod + def _process(cls, text): + return text.upper() + + +class TruncateTextNode(TextProcessingNode): + node_id = "TruncateText" + display_name = "Truncate Text" + description = "Truncate all texts to a maximum length." + extra_inputs = [ + io.Int.Input( + "max_length", default=77, min=1, max=10000, tooltip="Maximum text length." + ), + ] + + @classmethod + def _process(cls, text, max_length): + return text[:max_length] + + +class AddTextPrefixNode(TextProcessingNode): + node_id = "AddTextPrefix" + display_name = "Add Text Prefix" + description = "Add a prefix to all texts." + extra_inputs = [ + io.String.Input("prefix", default="", tooltip="Prefix to add."), + ] + + @classmethod + def _process(cls, text, prefix): + return prefix + text + + +class AddTextSuffixNode(TextProcessingNode): + node_id = "AddTextSuffix" + display_name = "Add Text Suffix" + description = "Add a suffix to all texts." + extra_inputs = [ + io.String.Input("suffix", default="", tooltip="Suffix to add."), + ] + + @classmethod + def _process(cls, text, suffix): + return text + suffix + + +class ReplaceTextNode(TextProcessingNode): + node_id = "ReplaceText" + display_name = "Replace Text" + description = "Replace text in all texts." + extra_inputs = [ + io.String.Input("find", default="", tooltip="Text to find."), + io.String.Input("replace", default="", tooltip="Text to replace with."), + ] + + @classmethod + def _process(cls, text, find, replace): + return text.replace(find, replace) + + +class StripWhitespaceNode(TextProcessingNode): + node_id = "StripWhitespace" + display_name = "Strip Whitespace" + description = "Strip leading and trailing whitespace from all texts." + + @classmethod + def _process(cls, text): + return text.strip() + + +# ========== Group Processing Example Nodes ========== + + +class ImageDeduplicationNode(ImageProcessingNode): + """Remove duplicate or very similar images from the dataset using perceptual hashing.""" + + node_id = "ImageDeduplication" + display_name = "Image Deduplication" + description = "Remove duplicate or very similar images from the dataset." + is_group_process = True # Requires full list to compare images + extra_inputs = [ + io.Float.Input( + "similarity_threshold", + default=0.95, + min=0.0, + max=1.0, + tooltip="Similarity threshold (0-1). Higher means more similar. Images above this threshold are considered duplicates.", + ), + ] + + @classmethod + def _group_process(cls, images, similarity_threshold): + """Remove duplicate images using perceptual hashing.""" + if len(images) == 0: + return [] + + # Compute simple perceptual hash for each image + def compute_hash(img_tensor): + """Compute a simple perceptual hash by resizing to 8x8 and comparing to average.""" + img = tensor_to_pil(img_tensor) + # Resize to 8x8 + img_small = img.resize((8, 8), Image.Resampling.LANCZOS).convert("L") + # Get pixels + pixels = list(img_small.getdata()) + # Compute average + avg = sum(pixels) / len(pixels) + # Create hash (1 if above average, 0 otherwise) + hash_bits = "".join("1" if p > avg else "0" for p in pixels) + return hash_bits + + def hamming_distance(hash1, hash2): + """Compute Hamming distance between two hash strings.""" + return sum(c1 != c2 for c1, c2 in zip(hash1, hash2)) + + # Compute hashes for all images + hashes = [compute_hash(img) for img in images] + + # Find duplicates + keep_indices = [] + for i in range(len(images)): + is_duplicate = False + for j in keep_indices: + # Compare hashes + distance = hamming_distance(hashes[i], hashes[j]) + similarity = 1.0 - (distance / 64.0) # 64 bits total + if similarity >= similarity_threshold: + is_duplicate = True + logging.info( + f"Image {i} is similar to image {j} (similarity: {similarity:.3f}), skipping" + ) + break + + if not is_duplicate: + keep_indices.append(i) + + # Return only unique images + unique_images = [images[i] for i in keep_indices] + logging.info( + f"Deduplication: kept {len(unique_images)} out of {len(images)} images" + ) + return unique_images + + +class ImageGridNode(ImageProcessingNode): + """Combine multiple images into a single grid/collage.""" + + node_id = "ImageGrid" + display_name = "Image Grid" + description = "Arrange multiple images into a grid layout." + is_group_process = True # Requires full list to create grid + is_output_list = False # Outputs single grid image + extra_inputs = [ + io.Int.Input( + "columns", + default=4, + min=1, + max=20, + tooltip="Number of columns in the grid.", + ), + io.Int.Input( + "cell_width", + default=256, + min=32, + max=2048, + tooltip="Width of each cell in the grid.", + ), + io.Int.Input( + "cell_height", + default=256, + min=32, + max=2048, + tooltip="Height of each cell in the grid.", + ), + io.Int.Input( + "padding", default=4, min=0, max=50, tooltip="Padding between images." + ), + ] + + @classmethod + def _group_process(cls, images, columns, cell_width, cell_height, padding): + """Arrange images into a grid.""" + if len(images) == 0: + raise ValueError("Cannot create grid from empty image list") + + # Calculate grid dimensions + num_images = len(images) + rows = (num_images + columns - 1) // columns # Ceiling division + + # Calculate total grid size + grid_width = columns * cell_width + (columns - 1) * padding + grid_height = rows * cell_height + (rows - 1) * padding + + # Create blank grid + grid = Image.new("RGB", (grid_width, grid_height), (0, 0, 0)) + + # Place images + for idx, img_tensor in enumerate(images): + row = idx // columns + col = idx % columns + + # Convert to PIL and resize to cell size + img = tensor_to_pil(img_tensor) + img = img.resize((cell_width, cell_height), Image.Resampling.LANCZOS) + + # Calculate position + x = col * (cell_width + padding) + y = row * (cell_height + padding) + + # Paste into grid + grid.paste(img, (x, y)) + + logging.info( + f"Created {columns}x{rows} grid with {num_images} images ({grid_width}x{grid_height})" + ) + return pil_to_tensor(grid) + + +class MergeImageListsNode(ImageProcessingNode): + """Merge multiple image lists into a single list.""" + + node_id = "MergeImageLists" + display_name = "Merge Image Lists" + description = "Concatenate multiple image lists into one." + is_group_process = True # Receives images as list + + @classmethod + def _group_process(cls, images): + """Simply return the images list (already merged by input handling).""" + # When multiple list inputs are connected, they're concatenated + # For now, this is a simple pass-through + logging.info(f"Merged image list contains {len(images)} images") + return images + + +class MergeTextListsNode(TextProcessingNode): + """Merge multiple text lists into a single list.""" + + node_id = "MergeTextLists" + display_name = "Merge Text Lists" + description = "Concatenate multiple text lists into one." + is_group_process = True # Receives texts as list + + @classmethod + def _group_process(cls, texts): + """Simply return the texts list (already merged by input handling).""" + # When multiple list inputs are connected, they're concatenated + # For now, this is a simple pass-through + logging.info(f"Merged text list contains {len(texts)} texts") + return texts + + +# ========== Training Dataset Nodes ========== + + +class MakeTrainingDataset(io.ComfyNode): + """Encode images with VAE and texts with CLIP to create a training dataset.""" + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="MakeTrainingDataset", + display_name="Make Training Dataset", + category="dataset", + is_experimental=True, + is_input_list=True, # images and texts as lists + inputs=[ + io.Image.Input("images", tooltip="List of images to encode."), + io.Vae.Input( + "vae", tooltip="VAE model for encoding images to latents." + ), + io.Clip.Input( + "clip", tooltip="CLIP model for encoding text to conditioning." + ), + io.String.Input( + "texts", + optional=True, + tooltip="List of text captions. Can be length n (matching images), 1 (repeated for all), or omitted (uses empty string).", + ), + ], + outputs=[ + io.Latent.Output( + display_name="latents", + is_output_list=True, + tooltip="List of latent dicts", + ), + io.Conditioning.Output( + display_name="conditioning", + is_output_list=True, + tooltip="List of conditioning lists", + ), + ], + ) + + @classmethod + def execute(cls, images, vae, clip, texts=None): + # Extract scalars (vae and clip are single values wrapped in lists) + vae = vae[0] + clip = clip[0] + + # Handle text list + num_images = len(images) + + if texts is None or len(texts) == 0: + # Treat as [""] for unconditional training + texts = [""] + + if len(texts) == 1 and num_images > 1: + # Repeat single text for all images + texts = texts * num_images + elif len(texts) != num_images: + raise ValueError( + f"Number of texts ({len(texts)}) does not match number of images ({num_images}). " + f"Text list should have length {num_images}, 1, or 0." + ) + + # Encode images with VAE + logging.info(f"Encoding {num_images} images with VAE...") + latents_list = [] # list[{"samples": tensor}] + for img_tensor in images: + # img_tensor is [1, H, W, 3] + latent_tensor = vae.encode(img_tensor[:, :, :, :3]) + latents_list.append({"samples": latent_tensor}) + + # Encode texts with CLIP + logging.info(f"Encoding {len(texts)} texts with CLIP...") + conditioning_list = [] # list[list[cond]] + for text in texts: + if text == "": + cond = clip.encode_from_tokens_scheduled(clip.tokenize("")) + else: + tokens = clip.tokenize(text) + cond = clip.encode_from_tokens_scheduled(tokens) + conditioning_list.append(cond) + + logging.info( + f"Created dataset with {len(latents_list)} latents and {len(conditioning_list)} conditioning." + ) + return io.NodeOutput(latents_list, conditioning_list) + + +class SaveTrainingDataset(io.ComfyNode): + """Save encoded training dataset (latents + conditioning) to disk.""" + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SaveTrainingDataset", + display_name="Save Training Dataset", + category="dataset", + is_experimental=True, + is_output_node=True, + is_input_list=True, # Receive lists + inputs=[ + io.Latent.Input( + "latents", + tooltip="List of latent dicts from MakeTrainingDataset.", + ), + io.Conditioning.Input( + "conditioning", + tooltip="List of conditioning lists from MakeTrainingDataset.", + ), + io.String.Input( + "folder_name", + default="training_dataset", + tooltip="Name of folder to save dataset (inside output directory).", + ), + io.Int.Input( + "shard_size", + default=1000, + min=1, + max=100000, + tooltip="Number of samples per shard file.", + ), + ], + outputs=[], + ) + + @classmethod + def execute(cls, latents, conditioning, folder_name, shard_size): + # Extract scalars + folder_name = folder_name[0] + shard_size = shard_size[0] + + # latents: list[{"samples": tensor}] + # conditioning: list[list[cond]] + + # Validate lengths match + if len(latents) != len(conditioning): + raise ValueError( + f"Number of latents ({len(latents)}) does not match number of conditions ({len(conditioning)}). " + f"Something went wrong in dataset preparation." + ) + + # Create output directory + output_dir = os.path.join(folder_paths.get_output_directory(), folder_name) + os.makedirs(output_dir, exist_ok=True) + + # Prepare data pairs + num_samples = len(latents) + num_shards = (num_samples + shard_size - 1) // shard_size # Ceiling division + + logging.info( + f"Saving {num_samples} samples to {num_shards} shards in {output_dir}..." + ) + + # Save data in shards + for shard_idx in range(num_shards): + start_idx = shard_idx * shard_size + end_idx = min(start_idx + shard_size, num_samples) + + # Get shard data (list of latent dicts and conditioning lists) + shard_data = { + "latents": latents[start_idx:end_idx], + "conditioning": conditioning[start_idx:end_idx], + } + + # Save shard + shard_filename = f"shard_{shard_idx:04d}.pkl" + shard_path = os.path.join(output_dir, shard_filename) + + with open(shard_path, "wb") as f: + torch.save(shard_data, f) + + logging.info( + f"Saved shard {shard_idx + 1}/{num_shards}: {shard_filename} ({end_idx - start_idx} samples)" + ) + + # Save metadata + metadata = { + "num_samples": num_samples, + "num_shards": num_shards, + "shard_size": shard_size, + } + metadata_path = os.path.join(output_dir, "metadata.json") + with open(metadata_path, "w") as f: + json.dump(metadata, f, indent=2) + + logging.info(f"Successfully saved {num_samples} samples to {output_dir}.") + return io.NodeOutput() + + +class LoadTrainingDataset(io.ComfyNode): + """Load encoded training dataset from disk.""" + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="LoadTrainingDataset", + display_name="Load Training Dataset", + category="dataset", + is_experimental=True, + inputs=[ + io.String.Input( + "folder_name", + default="training_dataset", + tooltip="Name of folder containing the saved dataset (inside output directory).", + ), + ], + outputs=[ + io.Latent.Output( + display_name="latents", + is_output_list=True, + tooltip="List of latent dicts", + ), + io.Conditioning.Output( + display_name="conditioning", + is_output_list=True, + tooltip="List of conditioning lists", + ), + ], + ) + + @classmethod + def execute(cls, folder_name): + # Get dataset directory + dataset_dir = os.path.join(folder_paths.get_output_directory(), folder_name) + + if not os.path.exists(dataset_dir): + raise ValueError(f"Dataset directory not found: {dataset_dir}") + + # Find all shard files + shard_files = sorted( + [ + f + for f in os.listdir(dataset_dir) + if f.startswith("shard_") and f.endswith(".pkl") + ] + ) + + if not shard_files: + raise ValueError(f"No shard files found in {dataset_dir}") + + logging.info(f"Loading {len(shard_files)} shards from {dataset_dir}...") + + # Load all shards + all_latents = [] # list[{"samples": tensor}] + all_conditioning = [] # list[list[cond]] + + for shard_file in shard_files: + shard_path = os.path.join(dataset_dir, shard_file) + + with open(shard_path, "rb") as f: + shard_data = torch.load(f) + + all_latents.extend(shard_data["latents"]) + all_conditioning.extend(shard_data["conditioning"]) + + logging.info(f"Loaded {shard_file}: {len(shard_data['latents'])} samples") + + logging.info( + f"Successfully loaded {len(all_latents)} samples from {dataset_dir}." + ) + return io.NodeOutput(all_latents, all_conditioning) + + +# ========== Extension Setup ========== + + +class DatasetExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + # Data loading/saving nodes + LoadImageDataSetFromFolderNode, + LoadImageTextDataSetFromFolderNode, + SaveImageDataSetToFolderNode, + SaveImageTextDataSetToFolderNode, + # Image transform nodes + ResizeImagesToSameSizeNode, + ResizeImagesToPixelCountNode, + ResizeImagesByShorterEdgeNode, + ResizeImagesByLongerEdgeNode, + CenterCropImagesNode, + RandomCropImagesNode, + FlipImagesNode, + NormalizeImagesNode, + AdjustBrightnessNode, + AdjustContrastNode, + ShuffleDatasetNode, + ShuffleImageTextDatasetNode, + # Text transform nodes + TextToLowercaseNode, + TextToUppercaseNode, + TruncateTextNode, + AddTextPrefixNode, + AddTextSuffixNode, + ReplaceTextNode, + StripWhitespaceNode, + # Group processing examples + ImageDeduplicationNode, + ImageGridNode, + MergeImageListsNode, + MergeTextListsNode, + # Training dataset nodes + MakeTrainingDataset, + SaveTrainingDataset, + LoadTrainingDataset, + ] + + +async def comfy_entrypoint() -> DatasetExtension: + return DatasetExtension() diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index 9e6ec6780..cb24ab709 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -1,15 +1,13 @@ -import datetime -import json import logging import os import numpy as np import safetensors import torch -from PIL import Image, ImageDraw, ImageFont -from PIL.PngImagePlugin import PngInfo import torch.utils.checkpoint -import tqdm +from tqdm.auto import trange +from PIL import Image, ImageDraw, ImageFont +from typing_extensions import override import comfy.samplers import comfy.sd @@ -18,9 +16,9 @@ import comfy.model_management import comfy_extras.nodes_custom_sampler import folder_paths import node_helpers -from comfy.cli_args import args -from comfy.comfy_types.node_typing import IO from comfy.weight_adapter import adapters, adapter_maps +from comfy_api.latest import ComfyExtension, io, ui +from comfy.utils import ProgressBar def make_batch_extra_option_dict(d, indicies, full_size=None): @@ -56,7 +54,18 @@ def process_cond_list(d, prefix=""): class TrainSampler(comfy.samplers.Sampler): - def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, grad_acc=1, total_steps=1, seed=0, training_dtype=torch.bfloat16): + def __init__( + self, + loss_fn, + optimizer, + loss_callback=None, + batch_size=1, + grad_acc=1, + total_steps=1, + seed=0, + training_dtype=torch.bfloat16, + real_dataset=None, + ): self.loss_fn = loss_fn self.optimizer = optimizer self.loss_callback = loss_callback @@ -65,54 +74,138 @@ class TrainSampler(comfy.samplers.Sampler): self.grad_acc = grad_acc self.seed = seed self.training_dtype = training_dtype + self.real_dataset: list[torch.Tensor] | None = real_dataset - def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False): + def fwd_bwd( + self, + model_wrap, + batch_sigmas, + batch_noise, + batch_latent, + cond, + indicies, + extra_args, + dataset_size, + bwd=True, + ): + xt = model_wrap.inner_model.model_sampling.noise_scaling( + batch_sigmas, batch_noise, batch_latent, False + ) + x0 = model_wrap.inner_model.model_sampling.noise_scaling( + torch.zeros_like(batch_sigmas), + torch.zeros_like(batch_noise), + batch_latent, + False, + ) + + model_wrap.conds["positive"] = [cond[i] for i in indicies] + batch_extra_args = make_batch_extra_option_dict( + extra_args, indicies, full_size=dataset_size + ) + + with torch.autocast(xt.device.type, dtype=self.training_dtype): + x0_pred = model_wrap( + xt.requires_grad_(True), + batch_sigmas.requires_grad_(True), + **batch_extra_args, + ) + loss = self.loss_fn(x0_pred, x0) + if bwd: + bwd_loss = loss / self.grad_acc + bwd_loss.backward() + return loss + + def sample( + self, + model_wrap, + sigmas, + extra_args, + callback, + noise, + latent_image=None, + denoise_mask=None, + disable_pbar=False, + ): model_wrap.conds = process_cond_list(model_wrap.conds) cond = model_wrap.conds["positive"] dataset_size = sigmas.size(0) torch.cuda.empty_cache() - for i in (pbar:=tqdm.trange(self.total_steps, desc="Training LoRA", smoothing=0.01, disable=not comfy.utils.PROGRESS_BAR_ENABLED)): - noisegen = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(self.seed + i * 1000) - indicies = torch.randperm(dataset_size)[:self.batch_size].tolist() - - batch_latent = torch.stack([latent_image[i] for i in indicies]) - batch_noise = noisegen.generate_noise({"samples": batch_latent}).to(batch_latent.device) - batch_sigmas = [ - model_wrap.inner_model.model_sampling.percent_to_sigma( - torch.rand((1,)).item() - ) for _ in range(min(self.batch_size, dataset_size)) - ] - batch_sigmas = torch.tensor(batch_sigmas).to(batch_latent.device) - - xt = model_wrap.inner_model.model_sampling.noise_scaling( - batch_sigmas, - batch_noise, - batch_latent, - False + ui_pbar = ProgressBar(self.total_steps) + for i in ( + pbar := trange( + self.total_steps, + desc="Training LoRA", + smoothing=0.01, + disable=not comfy.utils.PROGRESS_BAR_ENABLED, ) - x0 = model_wrap.inner_model.model_sampling.noise_scaling( - torch.zeros_like(batch_sigmas), - torch.zeros_like(batch_noise), - batch_latent, - False + ): + noisegen = comfy_extras.nodes_custom_sampler.Noise_RandomNoise( + self.seed + i * 1000 ) + indicies = torch.randperm(dataset_size)[: self.batch_size].tolist() - model_wrap.conds["positive"] = [ - cond[i] for i in indicies - ] - batch_extra_args = make_batch_extra_option_dict(extra_args, indicies, full_size=dataset_size) + if self.real_dataset is None: + batch_latent = torch.stack([latent_image[i] for i in indicies]) + batch_noise = noisegen.generate_noise({"samples": batch_latent}).to( + batch_latent.device + ) + batch_sigmas = [ + model_wrap.inner_model.model_sampling.percent_to_sigma( + torch.rand((1,)).item() + ) + for _ in range(min(self.batch_size, dataset_size)) + ] + batch_sigmas = torch.tensor(batch_sigmas).to(batch_latent.device) - with torch.autocast(xt.device.type, dtype=self.training_dtype): - x0_pred = model_wrap(xt, batch_sigmas, **batch_extra_args) - loss = self.loss_fn(x0_pred, x0) - loss.backward() - if self.loss_callback: - self.loss_callback(loss.item()) - pbar.set_postfix({"loss": f"{loss.item():.4f}"}) + loss = self.fwd_bwd( + model_wrap, + batch_sigmas, + batch_noise, + batch_latent, + cond, + indicies, + extra_args, + dataset_size, + bwd=True, + ) + if self.loss_callback: + self.loss_callback(loss.item()) + pbar.set_postfix({"loss": f"{loss.item():.4f}"}) + else: + total_loss = 0 + for index in indicies: + single_latent = self.real_dataset[index].to(latent_image) + batch_noise = noisegen.generate_noise( + {"samples": single_latent} + ).to(single_latent.device) + batch_sigmas = ( + model_wrap.inner_model.model_sampling.percent_to_sigma( + torch.rand((1,)).item() + ) + ) + batch_sigmas = torch.tensor([batch_sigmas]).to(single_latent.device) + loss = self.fwd_bwd( + model_wrap, + batch_sigmas, + batch_noise, + single_latent, + cond, + [index], + extra_args, + dataset_size, + bwd=False, + ) + total_loss += loss + total_loss = total_loss / self.grad_acc / len(indicies) + total_loss.backward() + if self.loss_callback: + self.loss_callback(total_loss.item()) + pbar.set_postfix({"loss": f"{total_loss.item():.4f}"}) - if (i+1) % self.grad_acc == 0: + if (i + 1) % self.grad_acc == 0: self.optimizer.step() self.optimizer.zero_grad() + ui_pbar.update(1) torch.cuda.empty_cache() return torch.zeros_like(latent_image) @@ -134,233 +227,6 @@ class BiasDiff(torch.nn.Module): return self.passive_memory_usage() -def load_and_process_images(image_files, input_dir, resize_method="None", w=None, h=None): - """Utility function to load and process a list of images. - - Args: - image_files: List of image filenames - input_dir: Base directory containing the images - resize_method: How to handle images of different sizes ("None", "Stretch", "Crop", "Pad") - - Returns: - torch.Tensor: Batch of processed images - """ - if not image_files: - raise ValueError("No valid images found in input") - - output_images = [] - - for file in image_files: - image_path = os.path.join(input_dir, file) - img = node_helpers.pillow(Image.open, image_path) - - if img.mode == "I": - img = img.point(lambda i: i * (1 / 255)) - img = img.convert("RGB") - - if w is None and h is None: - w, h = img.size[0], img.size[1] - - # Resize image to first image - if img.size[0] != w or img.size[1] != h: - if resize_method == "Stretch": - img = img.resize((w, h), Image.Resampling.LANCZOS) - elif resize_method == "Crop": - img = img.crop((0, 0, w, h)) - elif resize_method == "Pad": - img = img.resize((w, h), Image.Resampling.LANCZOS) - elif resize_method == "None": - raise ValueError( - "Your input image size does not match the first image in the dataset. Either select a valid resize method or use the same size for all images." - ) - - img_array = np.array(img).astype(np.float32) / 255.0 - img_tensor = torch.from_numpy(img_array)[None,] - output_images.append(img_tensor) - - return torch.cat(output_images, dim=0) - - -class LoadImageSetNode: - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "images": ( - [ - f - for f in os.listdir(folder_paths.get_input_directory()) - if f.endswith((".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif", ".jpe", ".apng", ".tif", ".tiff")) - ], - {"image_upload": True, "allow_batch": True}, - ) - }, - "optional": { - "resize_method": ( - ["None", "Stretch", "Crop", "Pad"], - {"default": "None"}, - ), - }, - } - - INPUT_IS_LIST = True - RETURN_TYPES = ("IMAGE",) - FUNCTION = "load_images" - CATEGORY = "loaders" - EXPERIMENTAL = True - DESCRIPTION = "Loads a batch of images from a directory for training." - - @classmethod - def VALIDATE_INPUTS(s, images, resize_method): - filenames = images[0] if isinstance(images[0], list) else images - - for image in filenames: - if not folder_paths.exists_annotated_filepath(image): - return "Invalid image file: {}".format(image) - return True - - def load_images(self, input_files, resize_method): - input_dir = folder_paths.get_input_directory() - valid_extensions = [".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif", ".jpe", ".apng", ".tif", ".tiff"] - image_files = [ - f - for f in input_files - if any(f.lower().endswith(ext) for ext in valid_extensions) - ] - output_tensor = load_and_process_images(image_files, input_dir, resize_method) - return (output_tensor,) - - -class LoadImageSetFromFolderNode: - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "folder": (folder_paths.get_input_subfolders(), {"tooltip": "The folder to load images from."}) - }, - "optional": { - "resize_method": ( - ["None", "Stretch", "Crop", "Pad"], - {"default": "None"}, - ), - }, - } - - RETURN_TYPES = ("IMAGE",) - FUNCTION = "load_images" - CATEGORY = "loaders" - EXPERIMENTAL = True - DESCRIPTION = "Loads a batch of images from a directory for training." - - def load_images(self, folder, resize_method): - sub_input_dir = os.path.join(folder_paths.get_input_directory(), folder) - valid_extensions = [".png", ".jpg", ".jpeg", ".webp"] - image_files = [ - f - for f in os.listdir(sub_input_dir) - if any(f.lower().endswith(ext) for ext in valid_extensions) - ] - output_tensor = load_and_process_images(image_files, sub_input_dir, resize_method) - return (output_tensor,) - - -class LoadImageTextSetFromFolderNode: - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "folder": (folder_paths.get_input_subfolders(), {"tooltip": "The folder to load images from."}), - "clip": (IO.CLIP, {"tooltip": "The CLIP model used for encoding the text."}), - }, - "optional": { - "resize_method": ( - ["None", "Stretch", "Crop", "Pad"], - {"default": "None"}, - ), - "width": ( - IO.INT, - { - "default": -1, - "min": -1, - "max": 10000, - "step": 1, - "tooltip": "The width to resize the images to. -1 means use the original width.", - }, - ), - "height": ( - IO.INT, - { - "default": -1, - "min": -1, - "max": 10000, - "step": 1, - "tooltip": "The height to resize the images to. -1 means use the original height.", - }, - ) - }, - } - - RETURN_TYPES = ("IMAGE", IO.CONDITIONING,) - FUNCTION = "load_images" - CATEGORY = "loaders" - EXPERIMENTAL = True - DESCRIPTION = "Loads a batch of images and caption from a directory for training." - - def load_images(self, folder, clip, resize_method, width=None, height=None): - if clip is None: - raise RuntimeError("ERROR: clip input is invalid: None\n\nIf the clip is from a checkpoint loader node your checkpoint does not contain a valid clip or text encoder model.") - - logging.info(f"Loading images from folder: {folder}") - - sub_input_dir = os.path.join(folder_paths.get_input_directory(), folder) - valid_extensions = [".png", ".jpg", ".jpeg", ".webp"] - - image_files = [] - for item in os.listdir(sub_input_dir): - path = os.path.join(sub_input_dir, item) - if any(item.lower().endswith(ext) for ext in valid_extensions): - image_files.append(path) - elif os.path.isdir(path): - # Support kohya-ss/sd-scripts folder structure - repeat = 1 - if item.split("_")[0].isdigit(): - repeat = int(item.split("_")[0]) - image_files.extend([ - os.path.join(path, f) for f in os.listdir(path) if any(f.lower().endswith(ext) for ext in valid_extensions) - ] * repeat) - - caption_file_path = [ - f.replace(os.path.splitext(f)[1], ".txt") - for f in image_files - ] - captions = [] - for caption_file in caption_file_path: - caption_path = os.path.join(sub_input_dir, caption_file) - if os.path.exists(caption_path): - with open(caption_path, "r", encoding="utf-8") as f: - caption = f.read().strip() - captions.append(caption) - else: - captions.append("") - - width = width if width != -1 else None - height = height if height != -1 else None - output_tensor = load_and_process_images(image_files, sub_input_dir, resize_method, width, height) - - logging.info(f"Loaded {len(output_tensor)} images from {sub_input_dir}.") - - logging.info(f"Encoding captions from {sub_input_dir}.") - conditions = [] - empty_cond = clip.encode_from_tokens_scheduled(clip.tokenize("")) - for text in captions: - if text == "": - conditions.append(empty_cond) - tokens = clip.tokenize(text) - conditions.extend(clip.encode_from_tokens_scheduled(tokens)) - logging.info(f"Encoded {len(conditions)} captions from {sub_input_dir}.") - return (output_tensor, conditions) - - def draw_loss_graph(loss_map, steps): width, height = 500, 300 img = Image.new("RGB", (width, height), "white") @@ -379,10 +245,14 @@ def draw_loss_graph(loss_map, steps): return img -def find_all_highest_child_module_with_forward(model: torch.nn.Module, result = None, name = None): +def find_all_highest_child_module_with_forward( + model: torch.nn.Module, result=None, name=None +): if result is None: result = [] - elif hasattr(model, "forward") and not isinstance(model, (torch.nn.ModuleList, torch.nn.Sequential, torch.nn.ModuleDict)): + elif hasattr(model, "forward") and not isinstance( + model, (torch.nn.ModuleList, torch.nn.Sequential, torch.nn.ModuleDict) + ): result.append(model) logging.debug(f"Found module with forward: {name} ({model.__class__.__name__})") return result @@ -396,12 +266,13 @@ def patch(m): if not hasattr(m, "forward"): return org_forward = m.forward + def fwd(args, kwargs): return org_forward(*args, **kwargs) + def checkpointing_fwd(*args, **kwargs): - return torch.utils.checkpoint.checkpoint( - fwd, args, kwargs, use_reentrant=False - ) + return torch.utils.checkpoint.checkpoint(fwd, args, kwargs, use_reentrant=False) + m.org_forward = org_forward m.forward = checkpointing_fwd @@ -412,130 +283,126 @@ def unpatch(m): del m.org_forward -class TrainLoraNode: +class TrainLoraNode(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": (IO.MODEL, {"tooltip": "The model to train the LoRA on."}), - "latents": ( - "LATENT", - { - "tooltip": "The Latents to use for training, serve as dataset/input of the model." - }, + def define_schema(cls): + return io.Schema( + node_id="TrainLoraNode", + display_name="Train LoRA", + category="training", + is_experimental=True, + is_input_list=True, # All inputs become lists + inputs=[ + io.Model.Input("model", tooltip="The model to train the LoRA on."), + io.Latent.Input( + "latents", + tooltip="The Latents to use for training, serve as dataset/input of the model.", ), - "positive": ( - IO.CONDITIONING, - {"tooltip": "The positive conditioning to use for training."}, + io.Conditioning.Input( + "positive", tooltip="The positive conditioning to use for training." ), - "batch_size": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 10000, - "step": 1, - "tooltip": "The batch size to use for training.", - }, + io.Int.Input( + "batch_size", + default=1, + min=1, + max=10000, + tooltip="The batch size to use for training.", ), - "grad_accumulation_steps": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 1024, - "step": 1, - "tooltip": "The number of gradient accumulation steps to use for training.", - } + io.Int.Input( + "grad_accumulation_steps", + default=1, + min=1, + max=1024, + tooltip="The number of gradient accumulation steps to use for training.", ), - "steps": ( - IO.INT, - { - "default": 16, - "min": 1, - "max": 100000, - "tooltip": "The number of steps to train the LoRA for.", - }, + io.Int.Input( + "steps", + default=16, + min=1, + max=100000, + tooltip="The number of steps to train the LoRA for.", ), - "learning_rate": ( - IO.FLOAT, - { - "default": 0.0005, - "min": 0.0000001, - "max": 1.0, - "step": 0.000001, - "tooltip": "The learning rate to use for training.", - }, + io.Float.Input( + "learning_rate", + default=0.0005, + min=0.0000001, + max=1.0, + step=0.0000001, + tooltip="The learning rate to use for training.", ), - "rank": ( - IO.INT, - { - "default": 8, - "min": 1, - "max": 128, - "tooltip": "The rank of the LoRA layers.", - }, + io.Int.Input( + "rank", + default=8, + min=1, + max=128, + tooltip="The rank of the LoRA layers.", ), - "optimizer": ( - ["AdamW", "Adam", "SGD", "RMSprop"], - { - "default": "AdamW", - "tooltip": "The optimizer to use for training.", - }, + io.Combo.Input( + "optimizer", + options=["AdamW", "Adam", "SGD", "RMSprop"], + default="AdamW", + tooltip="The optimizer to use for training.", ), - "loss_function": ( - ["MSE", "L1", "Huber", "SmoothL1"], - { - "default": "MSE", - "tooltip": "The loss function to use for training.", - }, + io.Combo.Input( + "loss_function", + options=["MSE", "L1", "Huber", "SmoothL1"], + default="MSE", + tooltip="The loss function to use for training.", ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "tooltip": "The seed to use for training (used in generator for LoRA weight initialization and noise sampling)", - }, + io.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + tooltip="The seed to use for training (used in generator for LoRA weight initialization and noise sampling)", ), - "training_dtype": ( - ["bf16", "fp32"], - {"default": "bf16", "tooltip": "The dtype to use for training."}, + io.Combo.Input( + "training_dtype", + options=["bf16", "fp32"], + default="bf16", + tooltip="The dtype to use for training.", ), - "lora_dtype": ( - ["bf16", "fp32"], - {"default": "bf16", "tooltip": "The dtype to use for lora."}, + io.Combo.Input( + "lora_dtype", + options=["bf16", "fp32"], + default="bf16", + tooltip="The dtype to use for lora.", ), - "algorithm": ( - list(adapter_maps.keys()), - {"default": list(adapter_maps.keys())[0], "tooltip": "The algorithm to use for training."}, + io.Combo.Input( + "algorithm", + options=list(adapter_maps.keys()), + default=list(adapter_maps.keys())[0], + tooltip="The algorithm to use for training.", ), - "gradient_checkpointing": ( - IO.BOOLEAN, - { - "default": True, - "tooltip": "Use gradient checkpointing for training.", - } + io.Boolean.Input( + "gradient_checkpointing", + default=True, + tooltip="Use gradient checkpointing for training.", ), - "existing_lora": ( - folder_paths.get_filename_list("loras") + ["[None]"], - { - "default": "[None]", - "tooltip": "The existing LoRA to append to. Set to None for new LoRA.", - }, + io.Combo.Input( + "existing_lora", + options=folder_paths.get_filename_list("loras") + ["[None]"], + default="[None]", + tooltip="The existing LoRA to append to. Set to None for new LoRA.", ), - }, - } + ], + outputs=[ + io.Model.Output( + display_name="model", tooltip="Model with LoRA applied" + ), + io.Custom("LORA_MODEL").Output( + display_name="lora", tooltip="LoRA weights" + ), + io.Custom("LOSS_MAP").Output( + display_name="loss_map", tooltip="Loss history" + ), + io.Int.Output(display_name="steps", tooltip="Total training steps"), + ], + ) - RETURN_TYPES = (IO.MODEL, IO.LORA_MODEL, IO.LOSS_MAP, IO.INT) - RETURN_NAMES = ("model_with_lora", "lora", "loss", "steps") - FUNCTION = "train" - CATEGORY = "training" - EXPERIMENTAL = True - - def train( - self, + @classmethod + def execute( + cls, model, latents, positive, @@ -553,13 +420,74 @@ class TrainLoraNode: gradient_checkpointing, existing_lora, ): + # Extract scalars from lists (due to is_input_list=True) + model = model[0] + batch_size = batch_size[0] + steps = steps[0] + grad_accumulation_steps = grad_accumulation_steps[0] + learning_rate = learning_rate[0] + rank = rank[0] + optimizer = optimizer[0] + loss_function = loss_function[0] + seed = seed[0] + training_dtype = training_dtype[0] + lora_dtype = lora_dtype[0] + algorithm = algorithm[0] + gradient_checkpointing = gradient_checkpointing[0] + existing_lora = existing_lora[0] + + # Handle latents - either single dict or list of dicts + if len(latents) == 1: + latents = latents[0]["samples"] # Single latent dict + else: + latent_list = [] + for latent in latents: + latent = latent["samples"] + bs = latent.shape[0] + if bs != 1: + for sub_latent in latent: + latent_list.append(sub_latent[None]) + else: + latent_list.append(latent) + latents = latent_list + + # Handle conditioning - either single list or list of lists + if len(positive) == 1: + positive = positive[0] # Single conditioning list + else: + # Multiple conditioning lists - flatten + flat_positive = [] + for cond in positive: + if isinstance(cond, list): + flat_positive.extend(cond) + else: + flat_positive.append(cond) + positive = flat_positive + mp = model.clone() dtype = node_helpers.string_to_torch_dtype(training_dtype) lora_dtype = node_helpers.string_to_torch_dtype(lora_dtype) mp.set_model_compute_dtype(dtype) - latents = latents["samples"].to(dtype) - num_images = latents.shape[0] + # latents here can be list of different size latent or one large batch + if isinstance(latents, list): + all_shapes = set() + latents = [t.to(dtype) for t in latents] + for latent in latents: + all_shapes.add(latent.shape) + logging.info(f"Latent shapes: {all_shapes}") + if len(all_shapes) > 1: + multi_res = True + else: + multi_res = False + latents = torch.cat(latents, dim=0) + num_images = len(latents) + elif isinstance(latents, torch.Tensor): + latents = latents.to(dtype) + num_images = latents.shape[0] + else: + logging.error(f"Invalid latents type: {type(latents)}") + logging.info(f"Total Images: {num_images}, Total Captions: {len(positive)}") if len(positive) == 1 and num_images > 1: positive = positive * num_images @@ -591,9 +519,7 @@ class TrainLoraNode: shape = m.weight.shape if len(shape) >= 2: alpha = float(existing_weights.get(f"{key}.alpha", 1.0)) - dora_scale = existing_weights.get( - f"{key}.dora_scale", None - ) + dora_scale = existing_weights.get(f"{key}.dora_scale", None) for adapter_cls in adapters: existing_adapter = adapter_cls.load( n, existing_weights, alpha, dora_scale @@ -605,7 +531,9 @@ class TrainLoraNode: adapter_cls = adapter_maps[algorithm] if existing_adapter is not None: - train_adapter = existing_adapter.to_train().to(lora_dtype) + train_adapter = existing_adapter.to_train().to( + lora_dtype + ) else: # Use LoRA with alpha=1.0 by default train_adapter = adapter_cls.create_train( @@ -629,7 +557,9 @@ class TrainLoraNode: if hasattr(m, "bias") and m.bias is not None: key = "{}.bias".format(n) bias = torch.nn.Parameter( - torch.zeros(m.bias.shape, dtype=lora_dtype, requires_grad=True) + torch.zeros( + m.bias.shape, dtype=lora_dtype, requires_grad=True + ) ) bias_module = BiasDiff(bias) lora_sd["{}.diff_b".format(n)] = bias @@ -657,24 +587,31 @@ class TrainLoraNode: # setup models if gradient_checkpointing: - for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model): + for m in find_all_highest_child_module_with_forward( + mp.model.diffusion_model + ): patch(m) mp.model.requires_grad_(False) - comfy.model_management.load_models_gpu([mp], memory_required=1e20, force_full_load=True) + comfy.model_management.load_models_gpu( + [mp], memory_required=1e20, force_full_load=True + ) # Setup sampler and guider like in test script loss_map = {"loss": []} + def loss_callback(loss): loss_map["loss"].append(loss) + train_sampler = TrainSampler( criterion, optimizer, loss_callback=loss_callback, batch_size=batch_size, grad_acc=grad_accumulation_steps, - total_steps=steps*grad_accumulation_steps, + total_steps=steps * grad_accumulation_steps, seed=seed, - training_dtype=dtype + training_dtype=dtype, + real_dataset=latents if multi_res else None, ) guider = comfy_extras.nodes_custom_sampler.Guider_Basic(mp) guider.set_conds(positive) # Set conditioning from input @@ -684,12 +621,15 @@ class TrainLoraNode: # Generate dummy sigmas and noise sigmas = torch.tensor(range(num_images)) noise = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(seed) + if multi_res: + # use first latent as dummy latent if multi_res + latents = latents[0].repeat(num_images, 1, 1, 1) guider.sample( noise.generate_noise({"samples": latents}), latents, train_sampler, sigmas, - seed=noise.seed + seed=noise.seed, ) finally: for m in mp.model.modules(): @@ -702,111 +642,118 @@ class TrainLoraNode: for param in lora_sd: lora_sd[param] = lora_sd[param].to(lora_dtype) - return (mp, lora_sd, loss_map, steps + existing_steps) + return io.NodeOutput(mp, lora_sd, loss_map, steps + existing_steps) -class LoraModelLoader: - def __init__(self): - self.loaded_lora = None +class LoraModelLoader(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="LoraModelLoader", + display_name="Load LoRA Model", + category="loaders", + is_experimental=True, + inputs=[ + io.Model.Input( + "model", tooltip="The diffusion model the LoRA will be applied to." + ), + io.Custom("LORA_MODEL").Input( + "lora", tooltip="The LoRA model to apply to the diffusion model." + ), + io.Float.Input( + "strength_model", + default=1.0, + min=-100.0, + max=100.0, + tooltip="How strongly to modify the diffusion model. This value can be negative.", + ), + ], + outputs=[ + io.Model.Output( + display_name="model", tooltip="The modified diffusion model." + ), + ], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": ("MODEL", {"tooltip": "The diffusion model the LoRA will be applied to."}), - "lora": (IO.LORA_MODEL, {"tooltip": "The LoRA model to apply to the diffusion model."}), - "strength_model": ("FLOAT", {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01, "tooltip": "How strongly to modify the diffusion model. This value can be negative."}), - } - } - - RETURN_TYPES = ("MODEL",) - OUTPUT_TOOLTIPS = ("The modified diffusion model.",) - FUNCTION = "load_lora_model" - - CATEGORY = "loaders" - DESCRIPTION = "Load Trained LoRA weights from Train LoRA node." - EXPERIMENTAL = True - - def load_lora_model(self, model, lora, strength_model): + def execute(cls, model, lora, strength_model): if strength_model == 0: - return (model, ) + return io.NodeOutput(model) - model_lora, _ = comfy.sd.load_lora_for_models(model, None, lora, strength_model, 0) - return (model_lora, ) + model_lora, _ = comfy.sd.load_lora_for_models( + model, None, lora, strength_model, 0 + ) + return io.NodeOutput(model_lora) -class SaveLoRA: - def __init__(self): - self.output_dir = folder_paths.get_output_directory() +class SaveLoRA(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SaveLoRA", + display_name="Save LoRA Weights", + category="loaders", + is_experimental=True, + is_output_node=True, + inputs=[ + io.Custom("LORA_MODEL").Input( + "lora", + tooltip="The LoRA model to save. Do not use the model with LoRA layers.", + ), + io.String.Input( + "prefix", + default="loras/ComfyUI_trained_lora", + tooltip="The prefix to use for the saved LoRA file.", + ), + io.Int.Input( + "steps", + optional=True, + tooltip="Optional: The number of steps to LoRA has been trained for, used to name the saved file.", + ), + ], + outputs=[], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "lora": ( - IO.LORA_MODEL, - { - "tooltip": "The LoRA model to save. Do not use the model with LoRA layers." - }, - ), - "prefix": ( - "STRING", - { - "default": "loras/ComfyUI_trained_lora", - "tooltip": "The prefix to use for the saved LoRA file.", - }, - ), - }, - "optional": { - "steps": ( - IO.INT, - { - "forceInput": True, - "tooltip": "Optional: The number of steps to LoRA has been trained for, used to name the saved file.", - }, - ), - }, - } - - RETURN_TYPES = () - FUNCTION = "save" - CATEGORY = "loaders" - EXPERIMENTAL = True - OUTPUT_NODE = True - - def save(self, lora, prefix, steps=None): - full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(prefix, self.output_dir) + def execute(cls, lora, prefix, steps=None): + output_dir = folder_paths.get_output_directory() + full_output_folder, filename, counter, subfolder, filename_prefix = ( + folder_paths.get_save_image_path(prefix, output_dir) + ) if steps is None: output_checkpoint = f"{filename}_{counter:05}_.safetensors" else: output_checkpoint = f"{filename}_{steps}_steps_{counter:05}_.safetensors" output_checkpoint = os.path.join(full_output_folder, output_checkpoint) safetensors.torch.save_file(lora, output_checkpoint) - return {} + return io.NodeOutput() -class LossGraphNode: - def __init__(self): - self.output_dir = folder_paths.get_temp_directory() +class LossGraphNode(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="LossGraphNode", + display_name="Plot Loss Graph", + category="training", + is_experimental=True, + is_output_node=True, + inputs=[ + io.Custom("LOSS_MAP").Input( + "loss", tooltip="Loss map from training node." + ), + io.String.Input( + "filename_prefix", + default="loss_graph", + tooltip="Prefix for the saved loss graph image.", + ), + ], + outputs=[], + hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "loss": (IO.LOSS_MAP, {"default": {}}), - "filename_prefix": (IO.STRING, {"default": "loss_graph"}), - }, - "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, - } - - RETURN_TYPES = () - FUNCTION = "plot_loss" - OUTPUT_NODE = True - CATEGORY = "training" - EXPERIMENTAL = True - DESCRIPTION = "Plots the loss graph and saves it to the output directory." - - def plot_loss(self, loss, filename_prefix, prompt=None, extra_pnginfo=None): + def execute(cls, loss, filename_prefix, prompt=None, extra_pnginfo=None): loss_values = loss["loss"] width, height = 800, 480 margin = 40 @@ -849,47 +796,27 @@ class LossGraphNode: (margin - 30, height - 10), f"{min_loss:.2f}", font=font, fill="black" ) - metadata = None - if not args.disable_metadata: - metadata = PngInfo() - if prompt is not None: - metadata.add_text("prompt", json.dumps(prompt)) - if extra_pnginfo is not None: - for x in extra_pnginfo: - metadata.add_text(x, json.dumps(extra_pnginfo[x])) + # Convert PIL image to tensor for PreviewImage + img_array = np.array(img).astype(np.float32) / 255.0 + img_tensor = torch.from_numpy(img_array)[None,] # [1, H, W, 3] - date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") - img.save( - os.path.join(self.output_dir, f"{filename_prefix}_{date}.png"), - pnginfo=metadata, - ) - return { - "ui": { - "images": [ - { - "filename": f"{filename_prefix}_{date}.png", - "subfolder": "", - "type": "temp", - } - ] - } - } + # Return preview UI + return io.NodeOutput(ui=ui.PreviewImage(img_tensor, cls=cls)) -NODE_CLASS_MAPPINGS = { - "TrainLoraNode": TrainLoraNode, - "SaveLoRANode": SaveLoRA, - "LoraModelLoader": LoraModelLoader, - "LoadImageSetFromFolderNode": LoadImageSetFromFolderNode, - "LoadImageTextSetFromFolderNode": LoadImageTextSetFromFolderNode, - "LossGraphNode": LossGraphNode, -} +# ========== Extension Setup ========== -NODE_DISPLAY_NAME_MAPPINGS = { - "TrainLoraNode": "Train LoRA", - "SaveLoRANode": "Save LoRA Weights", - "LoraModelLoader": "Load LoRA Model", - "LoadImageSetFromFolderNode": "Load Image Dataset from Folder", - "LoadImageTextSetFromFolderNode": "Load Image and Text Dataset from Folder", - "LossGraphNode": "Plot Loss Graph", -} + +class TrainingExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TrainLoraNode, + LoraModelLoader, + SaveLoRA, + LossGraphNode, + ] + + +async def comfy_entrypoint() -> TrainingExtension: + return TrainingExtension() diff --git a/nodes.py b/nodes.py index f4835c02e..bf73eb90e 100644 --- a/nodes.py +++ b/nodes.py @@ -2278,6 +2278,7 @@ async def init_builtin_extra_nodes(): "nodes_images.py", "nodes_video_model.py", "nodes_train.py", + "nodes_dataset.py", "nodes_sag.py", "nodes_perpneg.py", "nodes_stable3d.py", From eaf68c9b5bbfbcdac8988741f3948678c9465c1d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 26 Nov 2025 16:25:32 -0800 Subject: [PATCH 639/759] Make lora training work on Z Image and remove some redundant nodes. (#10927) --- comfy/ldm/lumina/model.py | 4 +- comfy_extras/nodes_dataset.py | 102 +--------------------------------- 2 files changed, 3 insertions(+), 103 deletions(-) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index c8643eb82..565400b54 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -509,7 +509,7 @@ class NextDiT(nn.Module): if self.pad_tokens_multiple is not None: pad_extra = (-cap_feats.shape[1]) % self.pad_tokens_multiple - cap_feats = torch.cat((cap_feats, self.cap_pad_token.to(device=cap_feats.device, dtype=cap_feats.dtype).unsqueeze(0).repeat(cap_feats.shape[0], pad_extra, 1)), dim=1) + cap_feats = torch.cat((cap_feats, self.cap_pad_token.to(device=cap_feats.device, dtype=cap_feats.dtype, copy=True).unsqueeze(0).repeat(cap_feats.shape[0], pad_extra, 1)), dim=1) cap_pos_ids = torch.zeros(bsz, cap_feats.shape[1], 3, dtype=torch.float32, device=device) cap_pos_ids[:, :, 0] = torch.arange(cap_feats.shape[1], dtype=torch.float32, device=device) + 1.0 @@ -525,7 +525,7 @@ class NextDiT(nn.Module): if self.pad_tokens_multiple is not None: pad_extra = (-x.shape[1]) % self.pad_tokens_multiple - x = torch.cat((x, self.x_pad_token.to(device=x.device, dtype=x.dtype).unsqueeze(0).repeat(x.shape[0], pad_extra, 1)), dim=1) + x = torch.cat((x, self.x_pad_token.to(device=x.device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(x.shape[0], pad_extra, 1)), dim=1) x_pos_ids = torch.nn.functional.pad(x_pos_ids, (0, 0, 0, pad_extra)) freqs_cis = self.rope_embedder(torch.cat((cap_pos_ids, x_pos_ids), dim=1)).movedim(1, 2) diff --git a/comfy_extras/nodes_dataset.py b/comfy_extras/nodes_dataset.py index b23867505..4789d7d53 100644 --- a/comfy_extras/nodes_dataset.py +++ b/comfy_extras/nodes_dataset.py @@ -1,6 +1,5 @@ import logging import os -import math import json import numpy as np @@ -624,79 +623,6 @@ class TextProcessingNode(io.ComfyNode): # ========== Image Transform Nodes ========== -class ResizeImagesToSameSizeNode(ImageProcessingNode): - node_id = "ResizeImagesToSameSize" - display_name = "Resize Images to Same Size" - description = "Resize all images to the same width and height." - extra_inputs = [ - io.Int.Input("width", default=512, min=1, max=8192, tooltip="Target width."), - io.Int.Input("height", default=512, min=1, max=8192, tooltip="Target height."), - io.Combo.Input( - "mode", - options=["stretch", "crop_center", "pad"], - default="stretch", - tooltip="Resize mode.", - ), - ] - - @classmethod - def _process(cls, image, width, height, mode): - img = tensor_to_pil(image) - - if mode == "stretch": - img = img.resize((width, height), Image.Resampling.LANCZOS) - elif mode == "crop_center": - left = max(0, (img.width - width) // 2) - top = max(0, (img.height - height) // 2) - right = min(img.width, left + width) - bottom = min(img.height, top + height) - img = img.crop((left, top, right, bottom)) - if img.width != width or img.height != height: - img = img.resize((width, height), Image.Resampling.LANCZOS) - elif mode == "pad": - img.thumbnail((width, height), Image.Resampling.LANCZOS) - new_img = Image.new("RGB", (width, height), (0, 0, 0)) - paste_x = (width - img.width) // 2 - paste_y = (height - img.height) // 2 - new_img.paste(img, (paste_x, paste_y)) - img = new_img - - return pil_to_tensor(img) - - -class ResizeImagesToPixelCountNode(ImageProcessingNode): - node_id = "ResizeImagesToPixelCount" - display_name = "Resize Images to Pixel Count" - description = "Resize images so that the total pixel count matches the specified number while preserving aspect ratio." - extra_inputs = [ - io.Int.Input( - "pixel_count", - default=512 * 512, - min=1, - max=8192 * 8192, - tooltip="Target pixel count.", - ), - io.Int.Input( - "steps", - default=64, - min=1, - max=128, - tooltip="The stepping for resize width/height.", - ), - ] - - @classmethod - def _process(cls, image, pixel_count, steps): - img = tensor_to_pil(image) - w, h = img.size - pixel_count_ratio = math.sqrt(pixel_count / (w * h)) - new_w = int(w * pixel_count_ratio / steps) * steps - new_h = int(h * pixel_count_ratio / steps) * steps - logging.info(f"Resizing from {w}x{h} to {new_w}x{new_h}") - img = img.resize((new_w, new_h), Image.Resampling.LANCZOS) - return pil_to_tensor(img) - - class ResizeImagesByShorterEdgeNode(ImageProcessingNode): node_id = "ResizeImagesByShorterEdge" display_name = "Resize Images by Shorter Edge" @@ -801,29 +727,6 @@ class RandomCropImagesNode(ImageProcessingNode): return pil_to_tensor(img) -class FlipImagesNode(ImageProcessingNode): - node_id = "FlipImages" - display_name = "Flip Images" - description = "Flip all images horizontally or vertically." - extra_inputs = [ - io.Combo.Input( - "direction", - options=["horizontal", "vertical"], - default="horizontal", - tooltip="Flip direction.", - ), - ] - - @classmethod - def _process(cls, image, direction): - img = tensor_to_pil(image) - if direction == "horizontal": - img = img.transpose(Image.FLIP_LEFT_RIGHT) - else: - img = img.transpose(Image.FLIP_TOP_BOTTOM) - return pil_to_tensor(img) - - class NormalizeImagesNode(ImageProcessingNode): node_id = "NormalizeImages" display_name = "Normalize Images" @@ -1470,7 +1373,7 @@ class LoadTrainingDataset(io.ComfyNode): shard_path = os.path.join(dataset_dir, shard_file) with open(shard_path, "rb") as f: - shard_data = torch.load(f) + shard_data = torch.load(f, weights_only=True) all_latents.extend(shard_data["latents"]) all_conditioning.extend(shard_data["conditioning"]) @@ -1496,13 +1399,10 @@ class DatasetExtension(ComfyExtension): SaveImageDataSetToFolderNode, SaveImageTextDataSetToFolderNode, # Image transform nodes - ResizeImagesToSameSizeNode, - ResizeImagesToPixelCountNode, ResizeImagesByShorterEdgeNode, ResizeImagesByLongerEdgeNode, CenterCropImagesNode, RandomCropImagesNode, - FlipImagesNode, NormalizeImagesNode, AdjustBrightnessNode, AdjustContrastNode, From c38e7d6599be1bdce580ccfdbb20b928315af05e Mon Sep 17 00:00:00 2001 From: Haoming <73768377+Haoming02@users.noreply.github.com> Date: Thu, 27 Nov 2025 12:28:44 +0800 Subject: [PATCH 640/759] block info (#10841) --- comfy/ldm/flux/model.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 1a24e6d95..d5674dea6 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -171,7 +171,10 @@ class Flux(nn.Module): pe = None blocks_replace = patches_replace.get("dit", {}) + transformer_options["total_blocks"] = len(self.double_blocks) + transformer_options["block_type"] = "double" for i, block in enumerate(self.double_blocks): + transformer_options["block_index"] = i if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} @@ -215,7 +218,10 @@ class Flux(nn.Module): if self.params.global_modulation: vec, _ = self.single_stream_modulation(vec_orig) + transformer_options["total_blocks"] = len(self.single_blocks) + transformer_options["block_type"] = "single" for i, block in enumerate(self.single_blocks): + transformer_options["block_index"] = i if ("single_block", i) in blocks_replace: def block_wrap(args): out = {} From f17251bec65b5760cfedec29eace7d77f4b35130 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Thu, 27 Nov 2025 16:03:03 +1000 Subject: [PATCH 641/759] Account for the VRAM cost of weight offloading (#10733) * mm: default to 0 for NUM_STREAMS Dont count the compute stream as an offload stream. This makes async offload accounting easier. * mm: remove 128MB minimum This is from a previous offloading system requirement. Remove it to make behaviour of the loader and partial unloader consistent. * mp: order the module list by offload expense Calculate an approximate offloading temporary VRAM cost to offload a weight and primary order the module load list by that. In the simple case this is just the same as the module weight, but with Loras, a weight with a lora consumes considerably more VRAM to do the Lora application on-the-fly. This will slightly prioritize lora weights, but is really for proper VRAM offload accounting. * mp: Account for the VRAM cost of weight offloading when checking the VRAM headroom, assume that the weight needs to be offloaded, and only load if it has space for both the load and offload * the number of streams. As the weights are ordered from largest to smallest by offload cost this is guaranteed to fit in VRAM (tm), as all weights that follow will be smaller. Make the partial unload aware of this system as well by saving the budget for offload VRAM to the model state and accounting accordingly. Its possible that partial unload increases the size of the largest offloaded weights, and thus needs to unload a little bit more than asked to accomodate the bigger temp buffers. Honor the existing codes floor on model weight loading of 128MB by having the patcher honor this separately withough regard to offloading. Otherwise when MM specifies its 128MB minimum, MP will see the biggest weights, and budget that 128MB to only offload buffer and load nothing which isnt the intent of these minimums. The same clamp applies in case of partial offload of the currently loading model. --- comfy/model_management.py | 6 ++-- comfy/model_patcher.py | 59 +++++++++++++++++++++++++++++---------- 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index a9327ac80..9c403d580 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -689,7 +689,7 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu loaded_memory = loaded_model.model_loaded_memory() current_free_mem = get_free_memory(torch_dev) + loaded_memory - lowvram_model_memory = max(128 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory())) + lowvram_model_memory = max(0, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory())) lowvram_model_memory = lowvram_model_memory - loaded_memory if lowvram_model_memory == 0: @@ -1012,7 +1012,7 @@ def force_channels_last(): STREAMS = {} -NUM_STREAMS = 1 +NUM_STREAMS = 0 if args.async_offload: NUM_STREAMS = 2 logging.info("Using async weight offloading with {} streams".format(NUM_STREAMS)) @@ -1030,7 +1030,7 @@ def current_stream(device): stream_counters = {} def get_offload_stream(device): stream_counter = stream_counters.get(device, 0) - if NUM_STREAMS <= 1: + if NUM_STREAMS == 0: return None if device in STREAMS: diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 73adc7f70..3eac77275 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -148,6 +148,15 @@ class LowVramPatch: else: return out +#The above patch logic may cast up the weight to fp32, and do math. Go with fp32 x 3 +LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR = 3 + +def low_vram_patch_estimate_vram(model, key): + weight, set_func, convert_func = get_key_weight(model, key) + if weight is None: + return 0 + return weight.numel() * torch.float32.itemsize * LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR + def get_key_weight(model, key): set_func = None convert_func = None @@ -269,6 +278,9 @@ class ModelPatcher: if not hasattr(self.model, 'current_weight_patches_uuid'): self.model.current_weight_patches_uuid = None + if not hasattr(self.model, 'model_offload_buffer_memory'): + self.model.model_offload_buffer_memory = 0 + def model_size(self): if self.size > 0: return self.size @@ -662,7 +674,16 @@ class ModelPatcher: skip = True # skip random weights in non leaf modules break if not skip and (hasattr(m, "comfy_cast_weights") or len(params) > 0): - loading.append((comfy.model_management.module_size(m), n, m, params)) + module_mem = comfy.model_management.module_size(m) + module_offload_mem = module_mem + if hasattr(m, "comfy_cast_weights"): + weight_key = "{}.weight".format(n) + bias_key = "{}.bias".format(n) + if weight_key in self.patches: + module_offload_mem += low_vram_patch_estimate_vram(self.model, weight_key) + if bias_key in self.patches: + module_offload_mem += low_vram_patch_estimate_vram(self.model, bias_key) + loading.append((module_offload_mem, module_mem, n, m, params)) return loading def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False): @@ -676,20 +697,22 @@ class ModelPatcher: load_completely = [] offloaded = [] + offload_buffer = 0 loading.sort(reverse=True) for x in loading: - n = x[1] - m = x[2] - params = x[3] - module_mem = x[0] + module_offload_mem, module_mem, n, m, params = x lowvram_weight = False + potential_offload = max(offload_buffer, module_offload_mem * (comfy.model_management.NUM_STREAMS + 1)) + lowvram_fits = mem_counter + module_mem + potential_offload < lowvram_model_memory + weight_key = "{}.weight".format(n) bias_key = "{}.bias".format(n) if not full_load and hasattr(m, "comfy_cast_weights"): - if mem_counter + module_mem >= lowvram_model_memory: + if not lowvram_fits: + offload_buffer = potential_offload lowvram_weight = True lowvram_counter += 1 lowvram_mem_counter += module_mem @@ -723,9 +746,11 @@ class ModelPatcher: if hasattr(m, "comfy_cast_weights"): wipe_lowvram_weight(m) - if full_load or mem_counter + module_mem < lowvram_model_memory: + if full_load or lowvram_fits: mem_counter += module_mem load_completely.append((module_mem, n, m, params)) + else: + offload_buffer = potential_offload if cast_weight and hasattr(m, "comfy_cast_weights"): m.prev_comfy_cast_weights = m.comfy_cast_weights @@ -766,7 +791,7 @@ class ModelPatcher: self.pin_weight_to_device("{}.{}".format(n, param)) if lowvram_counter > 0: - logging.info("loaded partially; {:.2f} MB usable, {:.2f} MB loaded, {:.2f} MB offloaded, lowvram patches: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), lowvram_mem_counter / (1024 * 1024), patch_counter)) + logging.info("loaded partially; {:.2f} MB usable, {:.2f} MB loaded, {:.2f} MB offloaded, {:.2f} MB buffer reserved, lowvram patches: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), lowvram_mem_counter / (1024 * 1024), offload_buffer / (1024 * 1024), patch_counter)) self.model.model_lowvram = True else: logging.info("loaded completely; {:.2f} MB usable, {:.2f} MB loaded, full load: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load)) @@ -778,6 +803,7 @@ class ModelPatcher: self.model.lowvram_patch_counter += patch_counter self.model.device = device_to self.model.model_loaded_weight_memory = mem_counter + self.model.model_offload_buffer_memory = offload_buffer self.model.current_weight_patches_uuid = self.patches_uuid for callback in self.get_all_callbacks(CallbacksMP.ON_LOAD): @@ -831,6 +857,7 @@ class ModelPatcher: self.model.to(device_to) self.model.device = device_to self.model.model_loaded_weight_memory = 0 + self.model.model_offload_buffer_memory = 0 for m in self.model.modules(): if hasattr(m, "comfy_patched_weights"): @@ -849,13 +876,14 @@ class ModelPatcher: patch_counter = 0 unload_list = self._load_list() unload_list.sort() + offload_buffer = self.model.model_offload_buffer_memory + for unload in unload_list: - if memory_to_free < memory_freed: + if memory_to_free + offload_buffer - self.model.model_offload_buffer_memory < memory_freed: break - module_mem = unload[0] - n = unload[1] - m = unload[2] - params = unload[3] + module_offload_mem, module_mem, n, m, params = unload + + potential_offload = (comfy.model_management.NUM_STREAMS + 1) * module_offload_mem lowvram_possible = hasattr(m, "comfy_cast_weights") if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True: @@ -906,15 +934,18 @@ class ModelPatcher: m.comfy_cast_weights = True m.comfy_patched_weights = False memory_freed += module_mem + offload_buffer = max(offload_buffer, potential_offload) logging.debug("freed {}".format(n)) for param in params: self.pin_weight_to_device("{}.{}".format(n, param)) + self.model.model_lowvram = True self.model.lowvram_patch_counter += patch_counter self.model.model_loaded_weight_memory -= memory_freed - logging.info("loaded partially: {:.2f} MB loaded, lowvram patches: {}".format(self.model.model_loaded_weight_memory / (1024 * 1024), self.model.lowvram_patch_counter)) + self.model.model_offload_buffer_memory = offload_buffer + logging.info("Unloaded partially: {:.2f} MB freed, {:.2f} MB remains loaded, {:.2f} MB buffer reserved, lowvram patches: {}".format(memory_freed / (1024 * 1024), self.model.model_loaded_weight_memory / (1024 * 1024), offload_buffer / (1024 * 1024), self.model.lowvram_patch_counter)) return memory_freed def partially_load(self, device_to, extra_memory=0, force_patch_weights=False): From 3f382a4f9884f7b672557028adb9bb85d075820d Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Fri, 28 Nov 2025 02:06:30 +1000 Subject: [PATCH 642/759] quant ops: Dequantize weight in-place (#10935) In flux2 these weights are huge (200MB). As plain_tensor is a throw-away deep copy, do this multiplication in-place to save VRAM. --- comfy/quant_ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index d2f3e7397..9b924560b 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -425,7 +425,8 @@ class TensorCoreFP8Layout(QuantizedLayout): @staticmethod def dequantize(qdata, scale, orig_dtype, **kwargs): plain_tensor = torch.ops.aten._to_copy.default(qdata, dtype=orig_dtype) - return plain_tensor * scale + plain_tensor.mul_(scale) + return plain_tensor @classmethod def get_plain_tensors(cls, qtensor): From b59750a86a4687056528f1d59470e207063a73a3 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Fri, 28 Nov 2025 06:12:56 +0800 Subject: [PATCH 643/759] Update template to 0.7.23 (#10949) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9291552d3..e0b2c566b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.32.9 -comfyui-workflow-templates==0.7.20 +comfyui-workflow-templates==0.7.23 comfyui-embedded-docs==0.3.1 torch torchsde From 9d8a817985bb069685e440b38762f95dc834d242 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 27 Nov 2025 14:46:12 -0800 Subject: [PATCH 644/759] Enable async offloading by default on Nvidia. (#10953) Add --disable-async-offload to disable it. If this causes OOMs that go away when you --disable-async-offload please report it. --- comfy/cli_args.py | 3 ++- comfy/model_management.py | 13 +++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index d2b60e347..5f0dfaa10 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -131,7 +131,8 @@ vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for e parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.") -parser.add_argument("--async-offload", action="store_true", help="Use async weight offloading.") +parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=None, metavar="NUM_STREAMS", help="Use async weight offloading. An optional argument controls the amount of offload streams. Default is 2. Enabled by default on Nvidia.") +parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.") parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.") diff --git a/comfy/model_management.py b/comfy/model_management.py index 9c403d580..38c506df5 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1013,8 +1013,17 @@ def force_channels_last(): STREAMS = {} NUM_STREAMS = 0 -if args.async_offload: - NUM_STREAMS = 2 +if args.async_offload is not None: + NUM_STREAMS = args.async_offload +else: + # Enable by default on Nvidia + if is_nvidia(): + NUM_STREAMS = 2 + +if args.disable_async_offload: + NUM_STREAMS = 0 + +if NUM_STREAMS > 0: logging.info("Using async weight offloading with {} streams".format(NUM_STREAMS)) def current_stream(device): From 52e778fff3c1d6f32c8d14cba9864faddba8475d Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 28 Nov 2025 12:52:59 +0200 Subject: [PATCH 645/759] feat(Kling-API-Nodes): add v2-5-turbo model to FirstLastFrame node (#10938) --- comfy_api_nodes/nodes_kling.py | 60 +++++++++++++++------------------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 36852038b..23a7f55f1 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -4,8 +4,6 @@ For source of truth on the allowed permutations of request fields, please refere - [Compatibility Table](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap) """ -from __future__ import annotations -from typing import Optional, TypeVar import math import logging @@ -66,9 +64,7 @@ from comfy_api_nodes.util import ( poll_op, ) from comfy_api.input_impl import VideoFromFile -from comfy_api.input.basic_types import AudioInput -from comfy_api.input.video_types import VideoInput -from comfy_api.latest import ComfyExtension, IO +from comfy_api.latest import ComfyExtension, IO, Input KLING_API_VERSION = "v1" PATH_TEXT_TO_VIDEO = f"/proxy/kling/{KLING_API_VERSION}/videos/text2video" @@ -94,8 +90,6 @@ AVERAGE_DURATION_IMAGE_GEN = 32 AVERAGE_DURATION_VIDEO_EFFECTS = 320 AVERAGE_DURATION_VIDEO_EXTEND = 320 -R = TypeVar("R") - MODE_TEXT2VIDEO = { "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"), @@ -130,6 +124,8 @@ MODE_START_END_FRAME = { "pro mode / 10s duration / kling-v1-6": ("pro", "10", "kling-v1-6"), "pro mode / 5s duration / kling-v2-1": ("pro", "5", "kling-v2-1"), "pro mode / 10s duration / kling-v2-1": ("pro", "10", "kling-v2-1"), + "pro mode / 5s duration / kling-v2-5-turbo": ("pro", "5", "kling-v2-5-turbo"), + "pro mode / 10s duration / kling-v2-5-turbo": ("pro", "10", "kling-v2-5-turbo"), } """ Returns a mapping of mode strings to their corresponding (mode, duration, model_name) tuples. @@ -296,7 +292,7 @@ def get_video_from_response(response) -> KlingVideoResult: return video -def get_video_url_from_response(response) -> Optional[str]: +def get_video_url_from_response(response) -> str | None: """Returns the first video url from the Kling video generation task result. Will not raise an error if the response is not valid. """ @@ -315,7 +311,7 @@ def get_images_from_response(response) -> list[KlingImageResult]: return images -def get_images_urls_from_response(response) -> Optional[str]: +def get_images_urls_from_response(response) -> str | None: """Returns the list of image urls from the Kling image generation task result. Will not raise an error if the response is not valid. If there is only one image, returns the url as a string. If there are multiple images, returns a list of urls. """ @@ -349,7 +345,7 @@ async def execute_text2video( model_mode: str, duration: str, aspect_ratio: str, - camera_control: Optional[KlingCameraControl] = None, + camera_control: KlingCameraControl | None = None, ) -> IO.NodeOutput: validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V) task_creation_response = await sync_op( @@ -394,8 +390,8 @@ async def execute_image2video( model_mode: str, aspect_ratio: str, duration: str, - camera_control: Optional[KlingCameraControl] = None, - end_frame: Optional[torch.Tensor] = None, + camera_control: KlingCameraControl | None = None, + end_frame: torch.Tensor | None = None, ) -> IO.NodeOutput: validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_I2V) validate_input_image(start_frame) @@ -451,8 +447,8 @@ async def execute_video_effect( model_name: str, duration: KlingVideoGenDuration, image_1: torch.Tensor, - image_2: Optional[torch.Tensor] = None, - model_mode: Optional[KlingVideoGenMode] = None, + image_2: torch.Tensor | None = None, + model_mode: KlingVideoGenMode | None = None, ) -> tuple[VideoFromFile, str, str]: if dual_character: request_input_field = KlingDualCharacterEffectInput( @@ -499,13 +495,13 @@ async def execute_video_effect( async def execute_lipsync( cls: type[IO.ComfyNode], - video: VideoInput, - audio: Optional[AudioInput] = None, - voice_language: Optional[str] = None, - model_mode: Optional[str] = None, - text: Optional[str] = None, - voice_speed: Optional[float] = None, - voice_id: Optional[str] = None, + video: Input.Video, + audio: Input.Audio | None = None, + voice_language: str | None = None, + model_mode: str | None = None, + text: str | None = None, + voice_speed: float | None = None, + voice_id: str | None = None, ) -> IO.NodeOutput: if text: validate_string(text, field_name="Text", max_length=MAX_PROMPT_LENGTH_LIP_SYNC) @@ -787,7 +783,7 @@ class KlingCameraControlT2VNode(IO.ComfyNode): negative_prompt: str, cfg_scale: float, aspect_ratio: str, - camera_control: Optional[KlingCameraControl] = None, + camera_control: KlingCameraControl | None = None, ) -> IO.NodeOutput: return await execute_text2video( cls, @@ -854,8 +850,8 @@ class KlingImage2VideoNode(IO.ComfyNode): mode: str, aspect_ratio: str, duration: str, - camera_control: Optional[KlingCameraControl] = None, - end_frame: Optional[torch.Tensor] = None, + camera_control: KlingCameraControl | None = None, + end_frame: torch.Tensor | None = None, ) -> IO.NodeOutput: return await execute_image2video( cls, @@ -965,15 +961,11 @@ class KlingStartEndFrameNode(IO.ComfyNode): IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), IO.Float.Input("cfg_scale", default=0.5, min=0.0, max=1.0), - IO.Combo.Input( - "aspect_ratio", - options=[i.value for i in KlingVideoGenAspectRatio], - default="16:9", - ), + IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]), IO.Combo.Input( "mode", options=modes, - default=modes[2], + default=modes[8], tooltip="The configuration to use for the video generation following the format: mode / duration / model_name.", ), ], @@ -1254,8 +1246,8 @@ class KlingLipSyncAudioToVideoNode(IO.ComfyNode): @classmethod async def execute( cls, - video: VideoInput, - audio: AudioInput, + video: Input.Video, + audio: Input.Audio, voice_language: str, ) -> IO.NodeOutput: return await execute_lipsync( @@ -1314,7 +1306,7 @@ class KlingLipSyncTextToVideoNode(IO.ComfyNode): @classmethod async def execute( cls, - video: VideoInput, + video: Input.Video, text: str, voice: str, voice_speed: float, @@ -1471,7 +1463,7 @@ class KlingImageGenerationNode(IO.ComfyNode): human_fidelity: float, n: int, aspect_ratio: KlingImageGenAspectRatio, - image: Optional[torch.Tensor] = None, + image: torch.Tensor | None = None, ) -> IO.NodeOutput: validate_string(prompt, field_name="prompt", min_length=1, max_length=MAX_PROMPT_LENGTH_IMAGE_GEN) validate_string(negative_prompt, field_name="negative_prompt", max_length=MAX_PROMPT_LENGTH_IMAGE_GEN) From ca7808f240d4d53e594d3b95753240313864c992 Mon Sep 17 00:00:00 2001 From: "Dr.Lt.Data" <128333288+ltdrdata@users.noreply.github.com> Date: Sat, 29 Nov 2025 05:43:17 +0900 Subject: [PATCH 646/759] fix(user_manager): fix typo in move_userdata dest validation (#10967) Check `dest` instead of `source` when validating destination path in move_userdata endpoint. --- app/user_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/user_manager.py b/app/user_manager.py index a2d376c0c..675f6c0c6 100644 --- a/app/user_manager.py +++ b/app/user_manager.py @@ -424,7 +424,7 @@ class UserManager(): return source dest = get_user_data_path(request, check_exists=False, param="dest") - if not isinstance(source, str): + if not isinstance(dest, str): return dest overwrite = request.query.get("overwrite", 'true') != "false" From f55c98a89f76fc06c435a728bc2e76b6b4051463 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 28 Nov 2025 13:16:46 -0800 Subject: [PATCH 647/759] Disable offload stream when torch compile. (#10961) --- comfy/model_management.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy/model_management.py b/comfy/model_management.py index 38c506df5..d8ce80010 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1042,6 +1042,9 @@ def get_offload_stream(device): if NUM_STREAMS == 0: return None + if torch.compiler.is_compiling(): + return None + if device in STREAMS: ss = STREAMS[device] #Sync the oldest stream in the queue with the current From 6484ac89dc683b178d9ef3f7406800f7132147ba Mon Sep 17 00:00:00 2001 From: Urle Sistiana <55231606+urlesistiana@users.noreply.github.com> Date: Sat, 29 Nov 2025 05:33:07 +0800 Subject: [PATCH 648/759] fix QuantizedTensor.is_contiguous (#10956) (#10959) --- comfy/quant_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index 9b924560b..bb1fb860c 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -235,8 +235,8 @@ class QuantizedTensor(torch.Tensor): def is_pinned(self): return self._qdata.is_pinned() - def is_contiguous(self): - return self._qdata.is_contiguous() + def is_contiguous(self, *arg, **kwargs): + return self._qdata.is_contiguous(*arg, **kwargs) # ============================================================================== # Generic Utilities (Layout-Agnostic Operations) From 0ff0457892467ef8a6ea235dcd0618c10ca44ee3 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Sat, 29 Nov 2025 07:38:12 +1000 Subject: [PATCH 649/759] mm: wrap the raw stream in context manager (#10958) The documentation of torch.foo.Stream being usable with with: suggests it starts at version 2.7. Use the old API for backwards compatibility. --- comfy/model_management.py | 19 +++++++++++++++---- comfy/ops.py | 2 ++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index d8ce80010..aeddbaefe 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1055,7 +1055,9 @@ def get_offload_stream(device): elif is_device_cuda(device): ss = [] for k in range(NUM_STREAMS): - ss.append(torch.cuda.Stream(device=device, priority=0)) + s1 = torch.cuda.Stream(device=device, priority=0) + s1.as_context = torch.cuda.stream + ss.append(s1) STREAMS[device] = ss s = ss[stream_counter] stream_counters[device] = stream_counter @@ -1063,7 +1065,9 @@ def get_offload_stream(device): elif is_device_xpu(device): ss = [] for k in range(NUM_STREAMS): - ss.append(torch.xpu.Stream(device=device, priority=0)) + s1 = torch.xpu.Stream(device=device, priority=0) + s1.as_context = torch.xpu.stream + ss.append(s1) STREAMS[device] = ss s = ss[stream_counter] stream_counters[device] = stream_counter @@ -1081,12 +1085,19 @@ def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, str if dtype is None or weight.dtype == dtype: return weight if stream is not None: - with stream: + wf_context = stream + if hasattr(wf_context, "as_context"): + wf_context = wf_context.as_context(stream) + with wf_context: return weight.to(dtype=dtype, copy=copy) return weight.to(dtype=dtype, copy=copy) + if stream is not None: - with stream: + wf_context = stream + if hasattr(wf_context, "as_context"): + wf_context = wf_context.as_context(stream) + with wf_context: r = torch.empty_like(weight, dtype=dtype, device=device) r.copy_(weight, non_blocking=non_blocking) else: diff --git a/comfy/ops.py b/comfy/ops.py index a0ff4e8f1..61a2f0754 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -95,6 +95,8 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of if offload_stream is not None: wf_context = offload_stream + if hasattr(wf_context, "as_context"): + wf_context = wf_context.as_context(offload_stream) else: wf_context = contextlib.nullcontext() From 065a2fbbec6af5c8e19a3add29703f83faf672d6 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 28 Nov 2025 16:37:39 -0800 Subject: [PATCH 650/759] Update driver link in AMD portable README (#10974) --- .ci/windows_amd_base_files/README_VERY_IMPORTANT.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt b/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt index 96a500be2..2cbb00d99 100755 --- a/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt +++ b/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt @@ -1,5 +1,5 @@ -As of the time of writing this you need this preview driver for best results: -https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOWS-PYTORCH-PREVIEW.html +As of the time of writing this you need this driver for best results: +https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOWS-PYTORCH-7-1-1.html HOW TO RUN: @@ -25,3 +25,4 @@ In the ComfyUI directory you will find a file: extra_model_paths.yaml.example Rename this file to: extra_model_paths.yaml and edit it with your favorite text editor. + From b9070857092a78cc952d70025fdcc0ff540d96ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Sat, 29 Nov 2025 02:40:19 +0200 Subject: [PATCH 651/759] Support video tiny VAEs (#10884) * Support video tiny VAEs * lighttaew scaling fix * Also support video taes in previews Only first frame for now as live preview playback is currently only available through VHS custom nodes. * Support Wan 2.1 lightVAE * Relocate elif block and set Wan VAE dim directly without using pruning rate for lightvae --- comfy/latent_formats.py | 5 +- comfy/sd.py | 34 +++++++- comfy/taesd/taehv.py | 171 ++++++++++++++++++++++++++++++++++++++++ latent_preview.py | 28 +++++-- nodes.py | 18 ++++- 5 files changed, 244 insertions(+), 12 deletions(-) create mode 100644 comfy/taesd/taehv.py diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 8e110f45d..f1ca0151e 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -431,6 +431,7 @@ class HunyuanVideo(LatentFormat): ] latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761] + taesd_decoder_name = "taehv" class Cosmos1CV8x8x8(LatentFormat): latent_channels = 16 @@ -494,7 +495,7 @@ class Wan21(LatentFormat): ]).view(1, self.latent_channels, 1, 1, 1) - self.taesd_decoder_name = None #TODO + self.taesd_decoder_name = "lighttaew2_1" def process_in(self, latent): latents_mean = self.latents_mean.to(latent.device, latent.dtype) @@ -565,6 +566,7 @@ class Wan22(Wan21): def __init__(self): self.scale_factor = 1.0 + self.taesd_decoder_name = "lighttaew2_2" self.latents_mean = torch.tensor([ -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557, -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825, @@ -719,6 +721,7 @@ class HunyuanVideo15(LatentFormat): latent_channels = 32 latent_dimensions = 3 scale_factor = 1.03682 + taesd_decoder_name = "lighttaehy1_5" class Hunyuan3Dv2(LatentFormat): latent_channels = 64 diff --git a/comfy/sd.py b/comfy/sd.py index 350fae92b..9eeb0c45a 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -60,6 +60,8 @@ import comfy.lora_convert import comfy.hooks import comfy.t2i_adapter.adapter import comfy.taesd.taesd +import comfy.taesd.taehv +import comfy.latent_formats import comfy.ldm.flux.redux @@ -508,13 +510,14 @@ class VAE: self.memory_used_encode = lambda shape, dtype: 3300 * shape[3] * shape[4] * model_management.dtype_size(dtype) self.memory_used_decode = lambda shape, dtype: 8000 * shape[3] * shape[4] * (16 * 16) * model_management.dtype_size(dtype) else: # Wan 2.1 VAE + dim = sd["decoder.head.0.gamma"].shape[0] self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8) self.upscale_index_formula = (4, 8, 8) self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8) self.downscale_index_formula = (4, 8, 8) self.latent_dim = 3 self.latent_channels = 16 - ddconfig = {"dim": 96, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0} + ddconfig = {"dim": dim, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0} self.first_stage_model = comfy.ldm.wan.vae.WanVAE(**ddconfig) self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype) @@ -584,6 +587,35 @@ class VAE: self.process_input = lambda audio: audio self.working_dtypes = [torch.float32] self.crop_input = False + elif "decoder.22.bias" in sd: # taehv, taew and lighttae + self.latent_channels = sd["decoder.1.weight"].shape[1] + self.latent_dim = 3 + self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16) + self.upscale_index_formula = (4, 16, 16) + self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16) + self.downscale_index_formula = (4, 16, 16) + if self.latent_channels == 48: # Wan 2.2 + self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=None) # taehv doesn't need scaling + self.process_input = lambda image: (_ for _ in ()).throw(NotImplementedError("This light tae doesn't support encoding currently")) + self.process_output = lambda image: image + self.memory_used_decode = lambda shape, dtype: (1800 * (max(1, (shape[-3] ** 0.7 * 0.1)) * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)) + elif self.latent_channels == 32 and sd["decoder.22.bias"].shape[0] == 12: # lighttae_hv15 + self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=comfy.latent_formats.HunyuanVideo15) + self.process_input = lambda image: (_ for _ in ()).throw(NotImplementedError("This light tae doesn't support encoding currently")) + self.memory_used_decode = lambda shape, dtype: (1200 * (max(1, (shape[-3] ** 0.7 * 0.05)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype)) + else: + if sd["decoder.1.weight"].dtype == torch.float16: # taehv currently only available in float16, so assume it's not lighttaew2_1 as otherwise state dicts are identical + latent_format=comfy.latent_formats.HunyuanVideo + else: + latent_format=None # lighttaew2_1 doesn't need scaling + self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=latent_format) + self.process_input = self.process_output = lambda image: image + self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8) + self.upscale_index_formula = (4, 8, 8) + self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8) + self.downscale_index_formula = (4, 8, 8) + self.memory_used_encode = lambda shape, dtype: (700 * (max(1, (shape[-3] ** 0.66 * 0.11)) * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)) + self.memory_used_decode = lambda shape, dtype: (50 * (max(1, (shape[-3] ** 0.65 * 0.26)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype)) else: logging.warning("WARNING: No VAE weights detected, VAE not initalized.") self.first_stage_model = None diff --git a/comfy/taesd/taehv.py b/comfy/taesd/taehv.py new file mode 100644 index 000000000..3dfe1e4d4 --- /dev/null +++ b/comfy/taesd/taehv.py @@ -0,0 +1,171 @@ +# Tiny AutoEncoder for HunyuanVideo and WanVideo https://github.com/madebyollin/taehv + +import torch +import torch.nn as nn +import torch.nn.functional as F +from tqdm.auto import tqdm +from collections import namedtuple, deque + +import comfy.ops +operations=comfy.ops.disable_weight_init + +DecoderResult = namedtuple("DecoderResult", ("frame", "memory")) +TWorkItem = namedtuple("TWorkItem", ("input_tensor", "block_index")) + +def conv(n_in, n_out, **kwargs): + return operations.Conv2d(n_in, n_out, 3, padding=1, **kwargs) + +class Clamp(nn.Module): + def forward(self, x): + return torch.tanh(x / 3) * 3 + +class MemBlock(nn.Module): + def __init__(self, n_in, n_out, act_func): + super().__init__() + self.conv = nn.Sequential(conv(n_in * 2, n_out), act_func, conv(n_out, n_out), act_func, conv(n_out, n_out)) + self.skip = operations.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity() + self.act = act_func + def forward(self, x, past): + return self.act(self.conv(torch.cat([x, past], 1)) + self.skip(x)) + +class TPool(nn.Module): + def __init__(self, n_f, stride): + super().__init__() + self.stride = stride + self.conv = operations.Conv2d(n_f*stride,n_f, 1, bias=False) + def forward(self, x): + _NT, C, H, W = x.shape + return self.conv(x.reshape(-1, self.stride * C, H, W)) + +class TGrow(nn.Module): + def __init__(self, n_f, stride): + super().__init__() + self.stride = stride + self.conv = operations.Conv2d(n_f, n_f*stride, 1, bias=False) + def forward(self, x): + _NT, C, H, W = x.shape + x = self.conv(x) + return x.reshape(-1, C, H, W) + +def apply_model_with_memblocks(model, x, parallel, show_progress_bar): + + B, T, C, H, W = x.shape + if parallel: + x = x.reshape(B*T, C, H, W) + # parallel over input timesteps, iterate over blocks + for b in tqdm(model, disable=not show_progress_bar): + if isinstance(b, MemBlock): + BT, C, H, W = x.shape + T = BT // B + _x = x.reshape(B, T, C, H, W) + mem = F.pad(_x, (0,0,0,0,0,0,1,0), value=0)[:,:T].reshape(x.shape) + x = b(x, mem) + else: + x = b(x) + BT, C, H, W = x.shape + T = BT // B + x = x.view(B, T, C, H, W) + else: + out = [] + work_queue = deque([TWorkItem(xt, 0) for t, xt in enumerate(x.reshape(B, T * C, H, W).chunk(T, dim=1))]) + progress_bar = tqdm(range(T), disable=not show_progress_bar) + mem = [None] * len(model) + while work_queue: + xt, i = work_queue.popleft() + if i == 0: + progress_bar.update(1) + if i == len(model): + out.append(xt) + del xt + else: + b = model[i] + if isinstance(b, MemBlock): + if mem[i] is None: + xt_new = b(xt, xt * 0) + mem[i] = xt.detach().clone() + else: + xt_new = b(xt, mem[i]) + mem[i] = xt.detach().clone() + del xt + work_queue.appendleft(TWorkItem(xt_new, i+1)) + elif isinstance(b, TPool): + if mem[i] is None: + mem[i] = [] + mem[i].append(xt.detach().clone()) + if len(mem[i]) == b.stride: + B, C, H, W = xt.shape + xt = b(torch.cat(mem[i], 1).view(B*b.stride, C, H, W)) + mem[i] = [] + work_queue.appendleft(TWorkItem(xt, i+1)) + elif isinstance(b, TGrow): + xt = b(xt) + NT, C, H, W = xt.shape + for xt_next in reversed(xt.view(B, b.stride*C, H, W).chunk(b.stride, 1)): + work_queue.appendleft(TWorkItem(xt_next, i+1)) + del xt + else: + xt = b(xt) + work_queue.appendleft(TWorkItem(xt, i+1)) + progress_bar.close() + x = torch.stack(out, 1) + return x + + +class TAEHV(nn.Module): + def __init__(self, latent_channels, parallel=False, decoder_time_upscale=(True, True), decoder_space_upscale=(True, True, True), latent_format=None, show_progress_bar=True): + super().__init__() + self.image_channels = 3 + self.patch_size = 1 + self.latent_channels = latent_channels + self.parallel = parallel + self.latent_format = latent_format + self.show_progress_bar = show_progress_bar + self.process_in = latent_format().process_in if latent_format is not None else (lambda x: x) + self.process_out = latent_format().process_out if latent_format is not None else (lambda x: x) + if self.latent_channels in [48, 32]: # Wan 2.2 and HunyuanVideo1.5 + self.patch_size = 2 + if self.latent_channels == 32: # HunyuanVideo1.5 + act_func = nn.LeakyReLU(0.2, inplace=True) + else: # HunyuanVideo, Wan 2.1 + act_func = nn.ReLU(inplace=True) + + self.encoder = nn.Sequential( + conv(self.image_channels*self.patch_size**2, 64), act_func, + TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), + TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), + TPool(64, 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), + conv(64, self.latent_channels), + ) + n_f = [256, 128, 64, 64] + self.frames_to_trim = 2**sum(decoder_time_upscale) - 1 + self.decoder = nn.Sequential( + Clamp(), conv(self.latent_channels, n_f[0]), act_func, + MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[0] else 1), TGrow(n_f[0], 1), conv(n_f[0], n_f[1], bias=False), + MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[1] else 1), TGrow(n_f[1], 2 if decoder_time_upscale[0] else 1), conv(n_f[1], n_f[2], bias=False), + MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[2] else 1), TGrow(n_f[2], 2 if decoder_time_upscale[1] else 1), conv(n_f[2], n_f[3], bias=False), + act_func, conv(n_f[3], self.image_channels*self.patch_size**2), + ) + @property + def show_progress_bar(self): + return self._show_progress_bar + + @show_progress_bar.setter + def show_progress_bar(self, value): + self._show_progress_bar = value + + def encode(self, x, **kwargs): + if self.patch_size > 1: x = F.pixel_unshuffle(x, self.patch_size) + x = x.movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W] + if x.shape[1] % 4 != 0: + # pad at end to multiple of 4 + n_pad = 4 - x.shape[1] % 4 + padding = x[:, -1:].repeat_interleave(n_pad, dim=1) + x = torch.cat([x, padding], 1) + x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar).movedim(2, 1) + return self.process_out(x) + + def decode(self, x, **kwargs): + x = self.process_in(x).movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W] + x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar) + if self.patch_size > 1: x = F.pixel_shuffle(x, self.patch_size) + return x[:, self.frames_to_trim:].movedim(2, 1) diff --git a/latent_preview.py b/latent_preview.py index ddf6dcf49..66bded4b9 100644 --- a/latent_preview.py +++ b/latent_preview.py @@ -2,17 +2,24 @@ import torch from PIL import Image from comfy.cli_args import args, LatentPreviewMethod from comfy.taesd.taesd import TAESD +from comfy.sd import VAE import comfy.model_management import folder_paths import comfy.utils import logging MAX_PREVIEW_RESOLUTION = args.preview_size +VIDEO_TAES = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5"] -def preview_to_image(latent_image): - latents_ubyte = (((latent_image + 1.0) / 2.0).clamp(0, 1) # change scale from -1..1 to 0..1 - .mul(0xFF) # to 0..255 - ) +def preview_to_image(latent_image, do_scale=True): + if do_scale: + latents_ubyte = (((latent_image + 1.0) / 2.0).clamp(0, 1) # change scale from -1..1 to 0..1 + .mul(0xFF) # to 0..255 + ) + else: + latents_ubyte = (latent_image.clamp(0, 1) + .mul(0xFF) # to 0..255 + ) if comfy.model_management.directml_enabled: latents_ubyte = latents_ubyte.to(dtype=torch.uint8) latents_ubyte = latents_ubyte.to(device="cpu", dtype=torch.uint8, non_blocking=comfy.model_management.device_supports_non_blocking(latent_image.device)) @@ -35,6 +42,10 @@ class TAESDPreviewerImpl(LatentPreviewer): x_sample = self.taesd.decode(x0[:1])[0].movedim(0, 2) return preview_to_image(x_sample) +class TAEHVPreviewerImpl(TAESDPreviewerImpl): + def decode_latent_to_preview(self, x0): + x_sample = self.taesd.decode(x0[:1, :, :1])[0][0] + return preview_to_image(x_sample, do_scale=False) class Latent2RGBPreviewer(LatentPreviewer): def __init__(self, latent_rgb_factors, latent_rgb_factors_bias=None, latent_rgb_factors_reshape=None): @@ -81,8 +92,13 @@ def get_previewer(device, latent_format): if method == LatentPreviewMethod.TAESD: if taesd_decoder_path: - taesd = TAESD(None, taesd_decoder_path, latent_channels=latent_format.latent_channels).to(device) - previewer = TAESDPreviewerImpl(taesd) + if latent_format.taesd_decoder_name in VIDEO_TAES: + taesd = VAE(comfy.utils.load_torch_file(taesd_decoder_path)) + taesd.first_stage_model.show_progress_bar = False + previewer = TAEHVPreviewerImpl(taesd) + else: + taesd = TAESD(None, taesd_decoder_path, latent_channels=latent_format.latent_channels).to(device) + previewer = TAESDPreviewerImpl(taesd) else: logging.warning("Warning: TAESD previews enabled, but could not find models/vae_approx/{}".format(latent_format.taesd_decoder_name)) diff --git a/nodes.py b/nodes.py index bf73eb90e..495dec806 100644 --- a/nodes.py +++ b/nodes.py @@ -692,8 +692,10 @@ class LoraLoaderModelOnly(LoraLoader): return (self.load_lora(model, None, lora_name, strength_model, 0)[0],) class VAELoader: + video_taes = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5"] + image_taes = ["taesd", "taesdxl", "taesd3", "taef1"] @staticmethod - def vae_list(): + def vae_list(s): vaes = folder_paths.get_filename_list("vae") approx_vaes = folder_paths.get_filename_list("vae_approx") sdxl_taesd_enc = False @@ -722,6 +724,11 @@ class VAELoader: f1_taesd_dec = True elif v.startswith("taef1_decoder."): f1_taesd_enc = True + else: + for tae in s.video_taes: + if v.startswith(tae): + vaes.append(v) + if sd1_taesd_dec and sd1_taesd_enc: vaes.append("taesd") if sdxl_taesd_dec and sdxl_taesd_enc: @@ -765,7 +772,7 @@ class VAELoader: @classmethod def INPUT_TYPES(s): - return {"required": { "vae_name": (s.vae_list(), )}} + return {"required": { "vae_name": (s.vae_list(s), )}} RETURN_TYPES = ("VAE",) FUNCTION = "load_vae" @@ -776,10 +783,13 @@ class VAELoader: if vae_name == "pixel_space": sd = {} sd["pixel_space_vae"] = torch.tensor(1.0) - elif vae_name in ["taesd", "taesdxl", "taesd3", "taef1"]: + elif vae_name in self.image_taes: sd = self.load_taesd(vae_name) else: - vae_path = folder_paths.get_full_path_or_raise("vae", vae_name) + if os.path.splitext(vae_name)[0] in self.video_taes: + vae_path = folder_paths.get_full_path_or_raise("vae_approx", vae_name) + else: + vae_path = folder_paths.get_full_path_or_raise("vae", vae_name) sd = comfy.utils.load_torch_file(vae_path) vae = comfy.sd.VAE(sd=sd) vae.throw_exception_if_invalid() From 52a32e2b323b90295ab05a8c299590c890f2ecb6 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 28 Nov 2025 18:12:42 -0800 Subject: [PATCH 652/759] Support some z image lora formats. (#10978) --- comfy/lora.py | 8 ++++++++ comfy/utils.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/comfy/lora.py b/comfy/lora.py index 36d26293a..360cd128f 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -313,6 +313,14 @@ def model_lora_keys_unet(model, key_map={}): key_map["transformer.{}".format(key_lora)] = k key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k #SimpleTuner lycoris format + if isinstance(model, comfy.model_base.Lumina2): + diffusers_keys = comfy.utils.z_image_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.") + for k in diffusers_keys: + to = diffusers_keys[k] + key_lora = k[:-len(".weight")] + key_map["diffusion_model.{}".format(key_lora)] = to + key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = to + return key_map diff --git a/comfy/utils.py b/comfy/utils.py index 4bd281057..21bd6e8cf 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -675,6 +675,51 @@ def flux_to_diffusers(mmdit_config, output_prefix=""): return key_map +def z_image_to_diffusers(mmdit_config, output_prefix=""): + n_layers = mmdit_config.get("n_layers", 0) + hidden_size = mmdit_config.get("dim", 0) + + key_map = {} + + for index in range(n_layers): + prefix_from = "layers.{}".format(index) + prefix_to = "{}layers.{}".format(output_prefix, index) + + for end in ("weight", "bias"): + k = "{}.attention.".format(prefix_from) + qkv = "{}.attention.qkv.{}".format(prefix_to, end) + + key_map["{}to_q.{}".format(k, end)] = (qkv, (0, 0, hidden_size)) + key_map["{}to_k.{}".format(k, end)] = (qkv, (0, hidden_size, hidden_size)) + key_map["{}to_v.{}".format(k, end)] = (qkv, (0, hidden_size * 2, hidden_size)) + + block_map = { + "attention.norm_q.weight": "attention.q_norm.weight", + "attention.norm_k.weight": "attention.k_norm.weight", + "attention.to_out.0.weight": "attention.out.weight", + "attention.to_out.0.bias": "attention.out.bias", + } + + for k in block_map: + key_map["{}.{}".format(prefix_from, k)] = "{}.{}".format(prefix_to, block_map[k]) + + MAP_BASIC = { + # Final layer + ("final_layer.linear.weight", "all_final_layer.2-1.linear.weight"), + ("final_layer.linear.bias", "all_final_layer.2-1.linear.bias"), + ("final_layer.adaLN_modulation.1.weight", "all_final_layer.2-1.adaLN_modulation.1.weight"), + ("final_layer.adaLN_modulation.1.bias", "all_final_layer.2-1.adaLN_modulation.1.bias"), + # X embedder + ("x_embedder.weight", "all_x_embedder.2-1.weight"), + ("x_embedder.bias", "all_x_embedder.2-1.bias"), + } + + for k in MAP_BASIC: + key_map[k[1]] = "{}{}".format(output_prefix, k[0]) + + return key_map + + def repeat_to_batch_size(tensor, batch_size, dim=0): if tensor.shape[dim] > batch_size: return tensor.narrow(dim, 0, batch_size) From af96d9812de3e420abd43275d9a5960535b6333c Mon Sep 17 00:00:00 2001 From: "Dr.Lt.Data" <128333288+ltdrdata@users.noreply.github.com> Date: Sat, 29 Nov 2025 11:28:42 +0900 Subject: [PATCH 653/759] feat(security): add System User protection with `__` prefix (#10966) * feat(security): add System User protection with `__` prefix Add protected namespace for custom nodes to store sensitive data (API keys, licenses) that cannot be accessed via HTTP endpoints. Key changes: - New API: get_system_user_directory() for internal access - New API: get_public_user_directory() with structural blocking - 3-layer defense: header validation, path blocking, creation prevention - 54 tests covering security, edge cases, and backward compatibility System Users use `__` prefix (e.g., __system, __cache) following Python's private member convention. They exist in user_directory/ but are completely blocked from /userdata HTTP endpoints. * style: remove unused imports --- app/user_manager.py | 21 +- folder_paths.py | 65 +++ .../app_test/user_manager_system_user_test.py | 193 +++++++++ .../folder_paths_test/system_user_test.py | 206 ++++++++++ .../system_user_endpoint_test.py | 375 ++++++++++++++++++ 5 files changed, 855 insertions(+), 5 deletions(-) create mode 100644 tests-unit/app_test/user_manager_system_user_test.py create mode 100644 tests-unit/folder_paths_test/system_user_test.py create mode 100644 tests-unit/prompt_server_test/system_user_endpoint_test.py diff --git a/app/user_manager.py b/app/user_manager.py index 675f6c0c6..e2c00dab2 100644 --- a/app/user_manager.py +++ b/app/user_manager.py @@ -59,6 +59,9 @@ class UserManager(): user = "default" if args.multi_user and "comfy-user" in request.headers: user = request.headers["comfy-user"] + # Block System Users (use same error message to prevent probing) + if user.startswith(folder_paths.SYSTEM_USER_PREFIX): + raise KeyError("Unknown user: " + user) if user not in self.users: raise KeyError("Unknown user: " + user) @@ -66,15 +69,16 @@ class UserManager(): return user def get_request_user_filepath(self, request, file, type="userdata", create_dir=True): - user_directory = folder_paths.get_user_directory() - if type == "userdata": - root_dir = user_directory + root_dir = folder_paths.get_user_directory() else: raise KeyError("Unknown filepath type:" + type) user = self.get_request_user_id(request) - path = user_root = os.path.abspath(os.path.join(root_dir, user)) + user_root = folder_paths.get_public_user_directory(user) + if user_root is None: + return None + path = user_root # prevent leaving /{type} if os.path.commonpath((root_dir, user_root)) != root_dir: @@ -101,7 +105,11 @@ class UserManager(): name = name.strip() if not name: raise ValueError("username not provided") + if name.startswith(folder_paths.SYSTEM_USER_PREFIX): + raise ValueError("System User prefix not allowed") user_id = re.sub("[^a-zA-Z0-9-_]+", '-', name) + if user_id.startswith(folder_paths.SYSTEM_USER_PREFIX): + raise ValueError("System User prefix not allowed") user_id = user_id + "_" + str(uuid.uuid4()) self.users[user_id] = name @@ -132,7 +140,10 @@ class UserManager(): if username in self.users.values(): return web.json_response({"error": "Duplicate username."}, status=400) - user_id = self.add_user(username) + try: + user_id = self.add_user(username) + except ValueError as e: + return web.json_response({"error": str(e)}, status=400) return web.json_response(user_id) @routes.get("/userdata") diff --git a/folder_paths.py b/folder_paths.py index ffdc4d020..9c96540e3 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -137,6 +137,71 @@ def set_user_directory(user_dir: str) -> None: user_directory = user_dir +# System User Protection - Protects system directories from HTTP endpoint access +# System Users are internal-only users that cannot be accessed via HTTP endpoints. +# They use the '__' prefix convention (similar to Python's private member convention). +SYSTEM_USER_PREFIX = "__" + + +def get_system_user_directory(name: str = "system") -> str: + """ + Get the path to a System User directory. + + System User directories (prefixed with '__') are only accessible via internal API, + not through HTTP endpoints. Use this for storing system-internal data that + should not be exposed to users. + + Args: + name: System user name (e.g., "system", "cache"). Must be alphanumeric + with underscores allowed, but cannot start with underscore. + + Returns: + Absolute path to the system user directory. + + Raises: + ValueError: If name is empty, invalid, or starts with underscore. + + Example: + >>> get_system_user_directory("cache") + '/path/to/user/__cache' + """ + if not name or not isinstance(name, str): + raise ValueError("System user name cannot be empty") + if not name.replace("_", "").isalnum(): + raise ValueError(f"Invalid system user name: '{name}'") + if name.startswith("_"): + raise ValueError("System user name should not start with underscore") + return os.path.join(get_user_directory(), f"{SYSTEM_USER_PREFIX}{name}") + + +def get_public_user_directory(user_id: str) -> str | None: + """ + Get the path to a Public User directory for HTTP endpoint access. + + This function provides structural security by returning None for any + System User (prefixed with '__'). All HTTP endpoints should use this + function instead of directly constructing user paths. + + Args: + user_id: User identifier from HTTP request. + + Returns: + Absolute path to the user directory, or None if user_id is invalid + or refers to a System User. + + Example: + >>> get_public_user_directory("default") + '/path/to/user/default' + >>> get_public_user_directory("__system") + None + """ + if not user_id or not isinstance(user_id, str): + return None + if user_id.startswith(SYSTEM_USER_PREFIX): + return None + return os.path.join(get_user_directory(), user_id) + + #NOTE: used in http server so don't put folders that should not be accessed remotely def get_directory_by_type(type_name: str) -> str | None: if type_name == "output": diff --git a/tests-unit/app_test/user_manager_system_user_test.py b/tests-unit/app_test/user_manager_system_user_test.py new file mode 100644 index 000000000..63b1ac5e5 --- /dev/null +++ b/tests-unit/app_test/user_manager_system_user_test.py @@ -0,0 +1,193 @@ +"""Tests for System User Protection in user_manager.py + +Tests cover: +- get_request_user_id(): 1st defense layer - blocks System Users from HTTP headers +- get_request_user_filepath(): 2nd defense layer - structural blocking via get_public_user_directory() +- add_user(): 3rd defense layer - prevents creation of System User names +- Defense layers integration tests +""" + +import pytest +from unittest.mock import MagicMock, patch +import tempfile + +import folder_paths +from app.user_manager import UserManager + + +@pytest.fixture +def mock_user_directory(): + """Create a temporary user directory.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_dir = folder_paths.get_user_directory() + folder_paths.set_user_directory(temp_dir) + yield temp_dir + folder_paths.set_user_directory(original_dir) + + +@pytest.fixture +def user_manager(mock_user_directory): + """Create a UserManager instance for testing.""" + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + manager = UserManager() + # Add a default user for testing + manager.users = {"default": "default", "test_user_123": "Test User"} + yield manager + + +@pytest.fixture +def mock_request(): + """Create a mock request object.""" + request = MagicMock() + request.headers = {} + return request + + +class TestGetRequestUserId: + """Tests for get_request_user_id() - 1st defense layer. + + Verifies: + - System Users (__ prefix) in HTTP header are rejected with KeyError + - Public Users pass through successfully + """ + + def test_system_user_raises_error(self, user_manager, mock_request): + """Test System User in header raises KeyError.""" + mock_request.headers = {"comfy-user": "__system"} + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + with pytest.raises(KeyError, match="Unknown user"): + user_manager.get_request_user_id(mock_request) + + def test_system_user_cache_raises_error(self, user_manager, mock_request): + """Test System User cache raises KeyError.""" + mock_request.headers = {"comfy-user": "__cache"} + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + with pytest.raises(KeyError, match="Unknown user"): + user_manager.get_request_user_id(mock_request) + + def test_normal_user_works(self, user_manager, mock_request): + """Test normal user access works.""" + mock_request.headers = {"comfy-user": "default"} + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + user_id = user_manager.get_request_user_id(mock_request) + assert user_id == "default" + + def test_unknown_user_raises_error(self, user_manager, mock_request): + """Test unknown user raises KeyError.""" + mock_request.headers = {"comfy-user": "unknown_user"} + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + with pytest.raises(KeyError, match="Unknown user"): + user_manager.get_request_user_id(mock_request) + + +class TestGetRequestUserFilepath: + """Tests for get_request_user_filepath() - 2nd defense layer. + + Verifies: + - Returns None when get_public_user_directory() returns None (System User) + - Acts as backup defense if 1st layer is bypassed + """ + + def test_system_user_returns_none(self, user_manager, mock_request, mock_user_directory): + """Test System User returns None (structural blocking).""" + # First, we need to mock get_request_user_id to return System User + # But actually, get_request_user_id will raise KeyError first + # So we test via get_public_user_directory returning None + mock_request.headers = {"comfy-user": "default"} + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + # Patch get_public_user_directory to return None for testing + with patch.object(folder_paths, 'get_public_user_directory', return_value=None): + result = user_manager.get_request_user_filepath(mock_request, "test.txt") + assert result is None + + def test_normal_user_gets_path(self, user_manager, mock_request, mock_user_directory): + """Test normal user gets valid filepath.""" + mock_request.headers = {"comfy-user": "default"} + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + path = user_manager.get_request_user_filepath(mock_request, "test.txt") + assert path is not None + assert "default" in path + assert path.endswith("test.txt") + + +class TestAddUser: + """Tests for add_user() - 3rd defense layer (creation-time blocking). + + Verifies: + - System User name (__ prefix) creation is rejected with ValueError + - Sanitized usernames that become System User are also rejected + """ + + def test_system_user_prefix_name_raises(self, user_manager): + """Test System User prefix in name raises ValueError.""" + with pytest.raises(ValueError, match="System User prefix not allowed"): + user_manager.add_user("__system") + + def test_system_user_prefix_cache_raises(self, user_manager): + """Test System User cache prefix raises ValueError.""" + with pytest.raises(ValueError, match="System User prefix not allowed"): + user_manager.add_user("__cache") + + def test_sanitized_system_user_prefix_raises(self, user_manager): + """Test sanitized name becoming System User prefix raises ValueError (bypass prevention).""" + # "__test" directly starts with System User prefix + with pytest.raises(ValueError, match="System User prefix not allowed"): + user_manager.add_user("__test") + + def test_normal_user_creation(self, user_manager, mock_user_directory): + """Test normal user creation works.""" + user_id = user_manager.add_user("Normal User") + assert user_id is not None + assert not user_id.startswith("__") + assert "Normal-User" in user_id or "Normal_User" in user_id + + def test_empty_name_raises(self, user_manager): + """Test empty name raises ValueError.""" + with pytest.raises(ValueError, match="username not provided"): + user_manager.add_user("") + + def test_whitespace_only_raises(self, user_manager): + """Test whitespace-only name raises ValueError.""" + with pytest.raises(ValueError, match="username not provided"): + user_manager.add_user(" ") + + +class TestDefenseLayers: + """Integration tests for all three defense layers. + + Verifies: + - Each defense layer blocks System Users independently + - System User bypass is impossible through any layer + """ + + def test_layer1_get_request_user_id(self, user_manager, mock_request): + """Test 1st defense layer blocks System Users.""" + mock_request.headers = {"comfy-user": "__system"} + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + with pytest.raises(KeyError): + user_manager.get_request_user_id(mock_request) + + def test_layer2_get_public_user_directory(self): + """Test 2nd defense layer blocks System Users.""" + result = folder_paths.get_public_user_directory("__system") + assert result is None + + def test_layer3_add_user(self, user_manager): + """Test 3rd defense layer blocks System User creation.""" + with pytest.raises(ValueError): + user_manager.add_user("__system") diff --git a/tests-unit/folder_paths_test/system_user_test.py b/tests-unit/folder_paths_test/system_user_test.py new file mode 100644 index 000000000..cd46459f1 --- /dev/null +++ b/tests-unit/folder_paths_test/system_user_test.py @@ -0,0 +1,206 @@ +"""Tests for System User Protection in folder_paths.py + +Tests cover: +- get_system_user_directory(): Internal API for custom nodes to access System User directories +- get_public_user_directory(): HTTP endpoint access with System User blocking +- Backward compatibility: Existing APIs unchanged +- Security: Path traversal and injection prevention +""" + +import pytest +import os +import tempfile + +from folder_paths import ( + get_system_user_directory, + get_public_user_directory, + get_user_directory, + set_user_directory, +) + + +@pytest.fixture(scope="module") +def mock_user_directory(): + """Create a temporary user directory for testing.""" + with tempfile.TemporaryDirectory() as temp_dir: + original_dir = get_user_directory() + set_user_directory(temp_dir) + yield temp_dir + set_user_directory(original_dir) + + +class TestGetSystemUserDirectory: + """Tests for get_system_user_directory() - internal API for System User directories. + + Verifies: + - Custom nodes can access System User directories via internal API + - Input validation prevents path traversal attacks + """ + + def test_default_name(self, mock_user_directory): + """Test default 'system' name.""" + path = get_system_user_directory() + assert path.endswith("__system") + assert mock_user_directory in path + + def test_custom_name(self, mock_user_directory): + """Test custom system user name.""" + path = get_system_user_directory("cache") + assert path.endswith("__cache") + assert "__cache" in path + + def test_name_with_underscore(self, mock_user_directory): + """Test name with underscore in middle.""" + path = get_system_user_directory("my_cache") + assert "__my_cache" in path + + def test_empty_name_raises(self): + """Test empty name raises ValueError.""" + with pytest.raises(ValueError, match="cannot be empty"): + get_system_user_directory("") + + def test_none_name_raises(self): + """Test None name raises ValueError.""" + with pytest.raises(ValueError, match="cannot be empty"): + get_system_user_directory(None) + + def test_name_starting_with_underscore_raises(self): + """Test name starting with underscore raises ValueError.""" + with pytest.raises(ValueError, match="should not start with underscore"): + get_system_user_directory("_system") + + def test_path_traversal_raises(self): + """Test path traversal attempt raises ValueError (security).""" + with pytest.raises(ValueError, match="Invalid system user name"): + get_system_user_directory("../escape") + + def test_path_traversal_middle_raises(self): + """Test path traversal in middle raises ValueError (security).""" + with pytest.raises(ValueError, match="Invalid system user name"): + get_system_user_directory("system/../other") + + def test_special_chars_raise(self): + """Test special characters raise ValueError (security).""" + with pytest.raises(ValueError, match="Invalid system user name"): + get_system_user_directory("system!") + + def test_returns_absolute_path(self, mock_user_directory): + """Test returned path is absolute.""" + path = get_system_user_directory("test") + assert os.path.isabs(path) + + +class TestGetPublicUserDirectory: + """Tests for get_public_user_directory() - HTTP endpoint access with System User blocking. + + Verifies: + - System Users (__ prefix) return None, blocking HTTP access + - Public Users get valid paths + - New endpoints using this function are automatically protected + """ + + def test_normal_user(self, mock_user_directory): + """Test normal user returns valid path.""" + path = get_public_user_directory("default") + assert path is not None + assert "default" in path + assert mock_user_directory in path + + def test_system_user_returns_none(self): + """Test System User (__ prefix) returns None - blocks HTTP access.""" + assert get_public_user_directory("__system") is None + + def test_system_user_cache_returns_none(self): + """Test System User cache returns None.""" + assert get_public_user_directory("__cache") is None + + def test_empty_user_returns_none(self): + """Test empty user returns None.""" + assert get_public_user_directory("") is None + + def test_none_user_returns_none(self): + """Test None user returns None.""" + assert get_public_user_directory(None) is None + + def test_header_injection_returns_none(self): + """Test header injection attempt returns None (security).""" + assert get_public_user_directory("__system\r\nX-Injected: true") is None + + def test_null_byte_injection_returns_none(self): + """Test null byte injection handling (security).""" + # Note: startswith check happens before any path operations + result = get_public_user_directory("user\x00__system") + # This should return a path since it doesn't start with __ + # The actual security comes from the path not being __* + assert result is not None or result is None # Depends on validation + + def test_path_traversal_attempt(self, mock_user_directory): + """Test path traversal attempt handling.""" + # This function doesn't validate paths, only reserved prefix + # Path traversal should be handled by the caller + path = get_public_user_directory("../../../etc/passwd") + # Returns path but doesn't start with __, so not None + # Actual path validation happens in user_manager + assert path is not None or "__" not in "../../../etc/passwd" + + def test_returns_absolute_path(self, mock_user_directory): + """Test returned path is absolute.""" + path = get_public_user_directory("testuser") + assert path is not None + assert os.path.isabs(path) + + +class TestBackwardCompatibility: + """Tests for backward compatibility with existing APIs. + + Verifies: + - get_user_directory() API unchanged + - Existing user data remains accessible + """ + + def test_get_user_directory_unchanged(self, mock_user_directory): + """Test get_user_directory() still works as before.""" + user_dir = get_user_directory() + assert user_dir is not None + assert os.path.isabs(user_dir) + assert user_dir == mock_user_directory + + def test_existing_user_accessible(self, mock_user_directory): + """Test existing users can access their directories.""" + path = get_public_user_directory("default") + assert path is not None + assert "default" in path + + +class TestEdgeCases: + """Tests for edge cases in System User detection. + + Verifies: + - Only __ prefix is blocked (not _, not middle __) + - Bypass attempts are prevented + """ + + def test_prefix_only(self): + """Test prefix-only string is blocked.""" + assert get_public_user_directory("__") is None + + def test_single_underscore_allowed(self): + """Test single underscore prefix is allowed (not System User).""" + path = get_public_user_directory("_system") + assert path is not None + assert "_system" in path + + def test_triple_underscore_blocked(self): + """Test triple underscore is blocked (starts with __).""" + assert get_public_user_directory("___system") is None + + def test_underscore_in_middle_allowed(self): + """Test underscore in middle is allowed.""" + path = get_public_user_directory("my__system") + assert path is not None + assert "my__system" in path + + def test_leading_space_allowed(self): + """Test leading space + prefix is allowed (doesn't start with __).""" + path = get_public_user_directory(" __system") + assert path is not None diff --git a/tests-unit/prompt_server_test/system_user_endpoint_test.py b/tests-unit/prompt_server_test/system_user_endpoint_test.py new file mode 100644 index 000000000..22ac00af9 --- /dev/null +++ b/tests-unit/prompt_server_test/system_user_endpoint_test.py @@ -0,0 +1,375 @@ +"""E2E Tests for System User Protection HTTP Endpoints + +Tests cover: +- HTTP endpoint blocking: System Users cannot access /userdata (GET, POST, DELETE, move) +- User creation blocking: System User names cannot be created via POST /users +- Backward compatibility: Public Users work as before +- Custom node scenario: Internal API works while HTTP is blocked +- Structural security: get_public_user_directory() provides automatic protection +""" + +import pytest +import os +from aiohttp import web +from app.user_manager import UserManager +from unittest.mock import patch +import folder_paths + + +@pytest.fixture +def mock_user_directory(tmp_path): + """Create a temporary user directory.""" + original_dir = folder_paths.get_user_directory() + folder_paths.set_user_directory(str(tmp_path)) + yield tmp_path + folder_paths.set_user_directory(original_dir) + + +@pytest.fixture +def user_manager_multi_user(mock_user_directory): + """Create UserManager in multi-user mode.""" + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + um = UserManager() + # Add test users + um.users = {"default": "default", "test_user_123": "Test User"} + yield um + + +@pytest.fixture +def app_multi_user(user_manager_multi_user): + """Create app with multi-user mode enabled.""" + app = web.Application() + routes = web.RouteTableDef() + user_manager_multi_user.add_routes(routes) + app.add_routes(routes) + return app + + +class TestSystemUserEndpointBlocking: + """E2E tests for System User blocking on all HTTP endpoints. + + Verifies: + - GET /userdata blocked for System Users + - POST /userdata blocked for System Users + - DELETE /userdata blocked for System Users + - POST /userdata/.../move/... blocked for System Users + """ + + @pytest.mark.asyncio + async def test_userdata_get_blocks_system_user( + self, aiohttp_client, app_multi_user, mock_user_directory + ): + """ + GET /userdata with System User header should be blocked. + """ + # Create test directory for System User (simulating internal creation) + system_user_dir = mock_user_directory / "__system" + system_user_dir.mkdir() + (system_user_dir / "secret.txt").write_text("sensitive data") + + client = await aiohttp_client(app_multi_user) + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + # Attempt to access System User's data via HTTP + resp = await client.get( + "/userdata?dir=.", + headers={"comfy-user": "__system"} + ) + + # Should be blocked (403 Forbidden or similar error) + assert resp.status in [400, 403, 500], \ + f"System User access should be blocked, got {resp.status}" + + @pytest.mark.asyncio + async def test_userdata_post_blocks_system_user( + self, aiohttp_client, app_multi_user, mock_user_directory + ): + """ + POST /userdata with System User header should be blocked. + """ + client = await aiohttp_client(app_multi_user) + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + resp = await client.post( + "/userdata/test.txt", + headers={"comfy-user": "__system"}, + data=b"malicious content" + ) + + assert resp.status in [400, 403, 500], \ + f"System User write should be blocked, got {resp.status}" + + # Verify no file was created + assert not (mock_user_directory / "__system" / "test.txt").exists() + + @pytest.mark.asyncio + async def test_userdata_delete_blocks_system_user( + self, aiohttp_client, app_multi_user, mock_user_directory + ): + """ + DELETE /userdata with System User header should be blocked. + """ + # Create a file in System User directory + system_user_dir = mock_user_directory / "__system" + system_user_dir.mkdir() + secret_file = system_user_dir / "secret.txt" + secret_file.write_text("do not delete") + + client = await aiohttp_client(app_multi_user) + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + resp = await client.delete( + "/userdata/secret.txt", + headers={"comfy-user": "__system"} + ) + + assert resp.status in [400, 403, 500], \ + f"System User delete should be blocked, got {resp.status}" + + # Verify file still exists + assert secret_file.exists() + + @pytest.mark.asyncio + async def test_v2_userdata_blocks_system_user( + self, aiohttp_client, app_multi_user, mock_user_directory + ): + """ + GET /v2/userdata with System User header should be blocked. + """ + client = await aiohttp_client(app_multi_user) + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + resp = await client.get( + "/v2/userdata", + headers={"comfy-user": "__system"} + ) + + assert resp.status in [400, 403, 500], \ + f"System User v2 access should be blocked, got {resp.status}" + + @pytest.mark.asyncio + async def test_move_userdata_blocks_system_user( + self, aiohttp_client, app_multi_user, mock_user_directory + ): + """ + POST /userdata/{file}/move/{dest} with System User header should be blocked. + """ + system_user_dir = mock_user_directory / "__system" + system_user_dir.mkdir() + (system_user_dir / "source.txt").write_text("sensitive data") + + client = await aiohttp_client(app_multi_user) + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + resp = await client.post( + "/userdata/source.txt/move/dest.txt", + headers={"comfy-user": "__system"} + ) + + assert resp.status in [400, 403, 500], \ + f"System User move should be blocked, got {resp.status}" + + # Verify source file still exists (move was blocked) + assert (system_user_dir / "source.txt").exists() + + +class TestSystemUserCreationBlocking: + """E2E tests for blocking System User name creation via POST /users. + + Verifies: + - POST /users returns 400 for System User name (not 500) + """ + + @pytest.mark.asyncio + async def test_post_users_blocks_system_user_name( + self, aiohttp_client, app_multi_user + ): + """POST /users with System User name should return 400 Bad Request.""" + client = await aiohttp_client(app_multi_user) + + resp = await client.post( + "/users", + json={"username": "__system"} + ) + + assert resp.status == 400, \ + f"System User creation should return 400, got {resp.status}" + + @pytest.mark.asyncio + async def test_post_users_blocks_system_user_prefix_variations( + self, aiohttp_client, app_multi_user + ): + """POST /users with any System User prefix variation should return 400 Bad Request.""" + client = await aiohttp_client(app_multi_user) + + system_user_names = ["__system", "__cache", "__config", "__anything"] + + for name in system_user_names: + resp = await client.post("/users", json={"username": name}) + assert resp.status == 400, \ + f"System User name '{name}' should return 400, got {resp.status}" + + +class TestPublicUserStillWorks: + """E2E tests for backward compatibility - Public Users should work as before. + + Verifies: + - Public Users can access their data via HTTP + - Public Users can create files via HTTP + """ + + @pytest.mark.asyncio + async def test_public_user_can_access_userdata( + self, aiohttp_client, app_multi_user, mock_user_directory + ): + """ + Public Users should still be able to access their data. + """ + # Create test directory for Public User + user_dir = mock_user_directory / "default" + user_dir.mkdir() + test_dir = user_dir / "workflows" + test_dir.mkdir() + (test_dir / "test.json").write_text('{"test": true}') + + client = await aiohttp_client(app_multi_user) + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + resp = await client.get( + "/userdata?dir=workflows", + headers={"comfy-user": "default"} + ) + + assert resp.status == 200 + data = await resp.json() + assert "test.json" in data + + @pytest.mark.asyncio + async def test_public_user_can_create_files( + self, aiohttp_client, app_multi_user, mock_user_directory + ): + """ + Public Users should still be able to create files. + """ + # Create user directory + user_dir = mock_user_directory / "default" + user_dir.mkdir() + + client = await aiohttp_client(app_multi_user) + + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + resp = await client.post( + "/userdata/newfile.txt", + headers={"comfy-user": "default"}, + data=b"user content" + ) + + assert resp.status == 200 + assert (user_dir / "newfile.txt").exists() + + +class TestCustomNodeScenario: + """Tests for custom node use case: internal API access vs HTTP blocking. + + Verifies: + - Internal API (get_system_user_directory) works for custom nodes + - HTTP endpoint cannot access data created via internal API + """ + + def test_internal_api_can_access_system_user(self, mock_user_directory): + """ + Internal API (get_system_user_directory) should work for custom nodes. + """ + # Custom node uses internal API + system_path = folder_paths.get_system_user_directory("mynode_config") + + assert system_path is not None + assert "__mynode_config" in system_path + + # Can create and write to System User directory + os.makedirs(system_path, exist_ok=True) + config_file = os.path.join(system_path, "settings.json") + with open(config_file, "w") as f: + f.write('{"api_key": "secret"}') + + assert os.path.exists(config_file) + + @pytest.mark.asyncio + async def test_http_cannot_access_internal_data( + self, aiohttp_client, app_multi_user, mock_user_directory + ): + """ + HTTP endpoint cannot access data created via internal API. + """ + # Custom node creates data via internal API + system_path = folder_paths.get_system_user_directory("mynode_config") + os.makedirs(system_path, exist_ok=True) + with open(os.path.join(system_path, "secret.json"), "w") as f: + f.write('{"api_key": "secret"}') + + client = await aiohttp_client(app_multi_user) + + # Attacker tries to access via HTTP + with patch('app.user_manager.args') as mock_args: + mock_args.multi_user = True + resp = await client.get( + "/userdata/secret.json", + headers={"comfy-user": "__mynode_config"} + ) + + # Should be blocked + assert resp.status in [400, 403, 500] + + +class TestStructuralSecurity: + """Tests for structural security pattern. + + Verifies: + - get_public_user_directory() automatically blocks System Users + - New endpoints using this function are automatically protected + """ + + def test_get_public_user_directory_blocks_system_user(self): + """ + Any code using get_public_user_directory() is automatically protected. + """ + # This is the structural security - any new endpoint using this function + # will automatically block System Users + assert folder_paths.get_public_user_directory("__system") is None + assert folder_paths.get_public_user_directory("__cache") is None + assert folder_paths.get_public_user_directory("__anything") is None + + # Public Users work + assert folder_paths.get_public_user_directory("default") is not None + assert folder_paths.get_public_user_directory("user123") is not None + + def test_structural_security_pattern(self, mock_user_directory): + """ + Demonstrate the structural security pattern for new endpoints. + + Any new endpoint should follow this pattern: + 1. Get user from request + 2. Use get_public_user_directory() - automatically blocks System Users + 3. If None, return error + """ + def new_endpoint_handler(user_id: str) -> str | None: + """Example of how new endpoints should be implemented.""" + user_path = folder_paths.get_public_user_directory(user_id) + if user_path is None: + return None # Blocked + return user_path + + # System Users are automatically blocked + assert new_endpoint_handler("__system") is None + assert new_endpoint_handler("__secret") is None + + # Public Users work + assert new_endpoint_handler("default") is not None From 5151cff293607c2191981fd16c62c1b1a6939695 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 28 Nov 2025 20:55:00 -0800 Subject: [PATCH 654/759] Add some missing z image lora layers. (#10980) --- comfy/lora.py | 9 +++++---- comfy/utils.py | 51 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/comfy/lora.py b/comfy/lora.py index 360cd128f..3a9077869 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -316,10 +316,11 @@ def model_lora_keys_unet(model, key_map={}): if isinstance(model, comfy.model_base.Lumina2): diffusers_keys = comfy.utils.z_image_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.") for k in diffusers_keys: - to = diffusers_keys[k] - key_lora = k[:-len(".weight")] - key_map["diffusion_model.{}".format(key_lora)] = to - key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = to + if k.endswith(".weight"): + to = diffusers_keys[k] + key_lora = k[:-len(".weight")] + key_map["diffusion_model.{}".format(key_lora)] = to + key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = to return key_map diff --git a/comfy/utils.py b/comfy/utils.py index 21bd6e8cf..37485e497 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -678,17 +678,14 @@ def flux_to_diffusers(mmdit_config, output_prefix=""): def z_image_to_diffusers(mmdit_config, output_prefix=""): n_layers = mmdit_config.get("n_layers", 0) hidden_size = mmdit_config.get("dim", 0) - + n_context_refiner = mmdit_config.get("n_refiner_layers", 2) + n_noise_refiner = mmdit_config.get("n_refiner_layers", 2) key_map = {} - for index in range(n_layers): - prefix_from = "layers.{}".format(index) - prefix_to = "{}layers.{}".format(output_prefix, index) - + def add_block_keys(prefix_from, prefix_to, has_adaln=True): for end in ("weight", "bias"): k = "{}.attention.".format(prefix_from) qkv = "{}.attention.qkv.{}".format(prefix_to, end) - key_map["{}to_q.{}".format(k, end)] = (qkv, (0, 0, hidden_size)) key_map["{}to_k.{}".format(k, end)] = (qkv, (0, hidden_size, hidden_size)) key_map["{}to_v.{}".format(k, end)] = (qkv, (0, hidden_size * 2, hidden_size)) @@ -698,28 +695,52 @@ def z_image_to_diffusers(mmdit_config, output_prefix=""): "attention.norm_k.weight": "attention.k_norm.weight", "attention.to_out.0.weight": "attention.out.weight", "attention.to_out.0.bias": "attention.out.bias", + "attention_norm1.weight": "attention_norm1.weight", + "attention_norm2.weight": "attention_norm2.weight", + "feed_forward.w1.weight": "feed_forward.w1.weight", + "feed_forward.w2.weight": "feed_forward.w2.weight", + "feed_forward.w3.weight": "feed_forward.w3.weight", + "ffn_norm1.weight": "ffn_norm1.weight", + "ffn_norm2.weight": "ffn_norm2.weight", } + if has_adaln: + block_map["adaLN_modulation.0.weight"] = "adaLN_modulation.0.weight" + block_map["adaLN_modulation.0.bias"] = "adaLN_modulation.0.bias" + for k, v in block_map.items(): + key_map["{}.{}".format(prefix_from, k)] = "{}.{}".format(prefix_to, v) - for k in block_map: - key_map["{}.{}".format(prefix_from, k)] = "{}.{}".format(prefix_to, block_map[k]) + for i in range(n_layers): + add_block_keys("layers.{}".format(i), "{}layers.{}".format(output_prefix, i)) - MAP_BASIC = { - # Final layer + for i in range(n_context_refiner): + add_block_keys("context_refiner.{}".format(i), "{}context_refiner.{}".format(output_prefix, i)) + + for i in range(n_noise_refiner): + add_block_keys("noise_refiner.{}".format(i), "{}noise_refiner.{}".format(output_prefix, i)) + + MAP_BASIC = [ ("final_layer.linear.weight", "all_final_layer.2-1.linear.weight"), ("final_layer.linear.bias", "all_final_layer.2-1.linear.bias"), ("final_layer.adaLN_modulation.1.weight", "all_final_layer.2-1.adaLN_modulation.1.weight"), ("final_layer.adaLN_modulation.1.bias", "all_final_layer.2-1.adaLN_modulation.1.bias"), - # X embedder ("x_embedder.weight", "all_x_embedder.2-1.weight"), ("x_embedder.bias", "all_x_embedder.2-1.bias"), - } + ("x_pad_token", "x_pad_token"), + ("cap_embedder.0.weight", "cap_embedder.0.weight"), + ("cap_embedder.1.weight", "cap_embedder.1.weight"), + ("cap_embedder.1.bias", "cap_embedder.1.bias"), + ("cap_pad_token", "cap_pad_token"), + ("t_embedder.mlp.0.weight", "t_embedder.mlp.0.weight"), + ("t_embedder.mlp.0.bias", "t_embedder.mlp.0.bias"), + ("t_embedder.mlp.2.weight", "t_embedder.mlp.2.weight"), + ("t_embedder.mlp.2.bias", "t_embedder.mlp.2.bias"), + ] - for k in MAP_BASIC: - key_map[k[1]] = "{}{}".format(output_prefix, k[0]) + for c, diffusers in MAP_BASIC: + key_map[diffusers] = "{}{}".format(output_prefix, c) return key_map - def repeat_to_batch_size(tensor, batch_size, dim=0): if tensor.shape[dim] > batch_size: return tensor.narrow(dim, 0, batch_size) From 0a6746898d6864d65e2fc7504e5e875f8c19c0ba Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 29 Nov 2025 15:00:55 -0800 Subject: [PATCH 655/759] Make the ScaleRope node work on Z Image and Lumina. (#10994) --- comfy/ldm/lumina/model.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index 565400b54..7d7e9112c 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -517,11 +517,23 @@ class NextDiT(nn.Module): B, C, H, W = x.shape x = self.x_embedder(x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2)) + rope_options = transformer_options.get("rope_options", None) + h_scale = 1.0 + w_scale = 1.0 + h_start = 0 + w_start = 0 + if rope_options is not None: + h_scale = rope_options.get("scale_y", 1.0) + w_scale = rope_options.get("scale_x", 1.0) + + h_start = rope_options.get("shift_y", 0.0) + w_start = rope_options.get("shift_x", 0.0) + H_tokens, W_tokens = H // pH, W // pW x_pos_ids = torch.zeros((bsz, x.shape[1], 3), dtype=torch.float32, device=device) x_pos_ids[:, :, 0] = cap_feats.shape[1] + 1 - x_pos_ids[:, :, 1] = torch.arange(H_tokens, dtype=torch.float32, device=device).view(-1, 1).repeat(1, W_tokens).flatten() - x_pos_ids[:, :, 2] = torch.arange(W_tokens, dtype=torch.float32, device=device).view(1, -1).repeat(H_tokens, 1).flatten() + x_pos_ids[:, :, 1] = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten() + x_pos_ids[:, :, 2] = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten() if self.pad_tokens_multiple is not None: pad_extra = (-x.shape[1]) % self.pad_tokens_multiple From 4967f81778f84b41acc40ed03536dd71dd88e5f2 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Sun, 30 Nov 2025 10:07:26 +0800 Subject: [PATCH 656/759] update template to 0.7.25 (#10996) * update template to 0.7.24 * Update template to 0.7.25 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e0b2c566b..386477808 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.32.9 -comfyui-workflow-templates==0.7.23 +comfyui-workflow-templates==0.7.25 comfyui-embedded-docs==0.3.1 torch torchsde From f8b981ae9a5676311624bbafa636a1874db79459 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 30 Nov 2025 01:21:31 -0800 Subject: [PATCH 657/759] Next AMD portable will have pytorch with ROCm 7.1.1 (#11002) --- .github/workflows/release-stable-all.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release-stable-all.yml b/.github/workflows/release-stable-all.yml index 9274b4170..d72ece2ce 100644 --- a/.github/workflows/release-stable-all.yml +++ b/.github/workflows/release-stable-all.yml @@ -65,11 +65,11 @@ jobs: contents: "write" packages: "write" pull-requests: "read" - name: "Release AMD ROCm 6.4.4" + name: "Release AMD ROCm 7.1.1" uses: ./.github/workflows/stable-release.yml with: git_tag: ${{ inputs.git_tag }} - cache_tag: "rocm644" + cache_tag: "rocm711" python_minor: "12" python_patch: "10" rel_name: "amd" From 7dbd5dfe91f057b83dcba0c127f712f6d71f7def Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Mon, 1 Dec 2025 10:27:17 -0800 Subject: [PATCH 658/759] bump comfyui-frontend-package to 1.32.10 (#11018) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 386477808..045b2ac54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.32.9 +comfyui-frontend-package==1.32.10 comfyui-workflow-templates==0.7.25 comfyui-embedded-docs==0.3.1 torch From 2640acb31ccfddee57ba22d5245bf456e8dffe53 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 1 Dec 2025 14:13:48 -0800 Subject: [PATCH 659/759] Update qwen tokenizer to add qwen 3 tokens. (#11029) Doesn't actually change anything for current workflows because none of the current models have a template with the think tokens. --- .../qwen25_tokenizer/tokenizer_config.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/comfy/text_encoders/qwen25_tokenizer/tokenizer_config.json b/comfy/text_encoders/qwen25_tokenizer/tokenizer_config.json index 67688e82c..df5b5d7fe 100644 --- a/comfy/text_encoders/qwen25_tokenizer/tokenizer_config.json +++ b/comfy/text_encoders/qwen25_tokenizer/tokenizer_config.json @@ -179,36 +179,36 @@ "special": false }, "151665": { - "content": "<|img|>", + "content": "", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, - "special": true + "special": false }, "151666": { - "content": "<|endofimg|>", + "content": "", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, - "special": true + "special": false }, "151667": { - "content": "<|meta|>", + "content": "", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, - "special": true + "special": false }, "151668": { - "content": "<|endofmeta|>", + "content": "", "lstrip": false, "normalized": false, "rstrip": false, "single_word": false, - "special": true + "special": false } }, "additional_special_tokens": [ From 1cb7e22a95701f2619d1ddf5683ea221b58a0c13 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 2 Dec 2025 02:11:52 +0200 Subject: [PATCH 660/759] [API Nodes] add Kling O1 model support (#11025) * feat(api-nodes): add Kling O1 model support * fix: increase max allowed duration to 10.05 seconds * fix(VideoInput): respect "format" argument --- comfy_api/latest/_input_impl/video_types.py | 5 +- comfy_api_nodes/apis/kling_api.py | 66 +++ comfy_api_nodes/nodes_kling.py | 444 +++++++++++++++++++- comfy_api_nodes/util/upload_helpers.py | 3 +- 4 files changed, 499 insertions(+), 19 deletions(-) create mode 100644 comfy_api_nodes/apis/kling_api.py diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py index bde37f90a..7231bf13c 100644 --- a/comfy_api/latest/_input_impl/video_types.py +++ b/comfy_api/latest/_input_impl/video_types.py @@ -336,7 +336,10 @@ class VideoFromComponents(VideoInput): raise ValueError("Only MP4 format is supported for now") if codec != VideoCodec.AUTO and codec != VideoCodec.H264: raise ValueError("Only H264 codec is supported for now") - with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}) as output: + extra_kwargs = {} + if format != VideoContainer.AUTO: + extra_kwargs["format"] = format.value + with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}, **extra_kwargs) as output: # Add metadata before writing any streams if metadata is not None: for key, value in metadata.items(): diff --git a/comfy_api_nodes/apis/kling_api.py b/comfy_api_nodes/apis/kling_api.py new file mode 100644 index 000000000..0a3b447c5 --- /dev/null +++ b/comfy_api_nodes/apis/kling_api.py @@ -0,0 +1,66 @@ +from pydantic import BaseModel, Field + + +class OmniProText2VideoRequest(BaseModel): + model_name: str = Field(..., description="kling-video-o1") + aspect_ratio: str = Field(..., description="'16:9', '9:16' or '1:1'") + duration: str = Field(..., description="'5' or '10'") + prompt: str = Field(...) + mode: str = Field("pro") + + +class OmniParamImage(BaseModel): + image_url: str = Field(...) + type: str | None = Field(None, description="Can be 'first_frame' or 'end_frame'") + + +class OmniParamVideo(BaseModel): + video_url: str = Field(...) + refer_type: str | None = Field(..., description="Can be 'base' or 'feature'") + keep_original_sound: str = Field(..., description="'yes' or 'no'") + + +class OmniProFirstLastFrameRequest(BaseModel): + model_name: str = Field(..., description="kling-video-o1") + image_list: list[OmniParamImage] = Field(..., min_length=1, max_length=7) + duration: str = Field(..., description="'5' or '10'") + prompt: str = Field(...) + mode: str = Field("pro") + + +class OmniProReferences2VideoRequest(BaseModel): + model_name: str = Field(..., description="kling-video-o1") + aspect_ratio: str | None = Field(..., description="'16:9', '9:16' or '1:1'") + image_list: list[OmniParamImage] | None = Field( + None, max_length=7, description="Max length 4 when video is present." + ) + video_list: list[OmniParamVideo] | None = Field(None, max_length=1) + duration: str | None = Field(..., description="From 3 to 10.") + prompt: str = Field(...) + mode: str = Field("pro") + + +class TaskStatusVideoResult(BaseModel): + duration: str | None = Field(None, description="Total video duration") + id: str | None = Field(None, description="Generated video ID") + url: str | None = Field(None, description="URL for generated video") + + +class TaskStatusVideoResults(BaseModel): + videos: list[TaskStatusVideoResult] | None = Field(None) + + +class TaskStatusVideoResponseData(BaseModel): + created_at: int | None = Field(None, description="Task creation time") + updated_at: int | None = Field(None, description="Task update time") + task_status: str | None = None + task_status_msg: str | None = Field(None, description="Additional failure reason. Only for polling endpoint.") + task_id: str | None = Field(None, description="Task ID") + task_result: TaskStatusVideoResults | None = Field(None) + + +class TaskStatusVideoResponse(BaseModel): + code: int | None = Field(None, description="Error code") + message: str | None = Field(None, description="Error message") + request_id: str | None = Field(None, description="Request ID") + data: TaskStatusVideoResponseData | None = Field(None) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 23a7f55f1..850c44db6 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -4,13 +4,13 @@ For source of truth on the allowed permutations of request fields, please refere - [Compatibility Table](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap) """ -import math import logging - -from typing_extensions import override +import math import torch +from typing_extensions import override +from comfy_api.latest import IO, ComfyExtension, Input, InputImpl from comfy_api_nodes.apis import ( KlingCameraControl, KlingCameraConfig, @@ -48,23 +48,31 @@ from comfy_api_nodes.apis import ( KlingCharacterEffectModelName, KlingSingleImageEffectModelName, ) +from comfy_api_nodes.apis.kling_api import ( + OmniParamImage, + OmniParamVideo, + OmniProFirstLastFrameRequest, + OmniProReferences2VideoRequest, + OmniProText2VideoRequest, + TaskStatusVideoResponse, +) from comfy_api_nodes.util import ( - validate_image_dimensions, + ApiEndpoint, + download_url_to_image_tensor, + download_url_to_video_output, + get_number_of_images, + poll_op, + sync_op, + tensor_to_base64_string, + upload_audio_to_comfyapi, + upload_images_to_comfyapi, + upload_video_to_comfyapi, validate_image_aspect_ratio, + validate_image_dimensions, + validate_string, validate_video_dimensions, validate_video_duration, - tensor_to_base64_string, - validate_string, - upload_audio_to_comfyapi, - download_url_to_image_tensor, - upload_video_to_comfyapi, - download_url_to_video_output, - sync_op, - ApiEndpoint, - poll_op, ) -from comfy_api.input_impl import VideoFromFile -from comfy_api.latest import ComfyExtension, IO, Input KLING_API_VERSION = "v1" PATH_TEXT_TO_VIDEO = f"/proxy/kling/{KLING_API_VERSION}/videos/text2video" @@ -202,6 +210,20 @@ VOICES_CONFIG = { } +async def finish_omni_video_task(cls: type[IO.ComfyNode], response: TaskStatusVideoResponse) -> IO.NodeOutput: + if response.code: + raise RuntimeError( + f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}" + ) + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/kling/v1/videos/omni-video/{response.data.task_id}"), + response_model=TaskStatusVideoResponse, + status_extractor=lambda r: (r.data.task_status if r.data else None), + ) + return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url)) + + def is_valid_camera_control_configs(configs: list[float]) -> bool: """Verifies that at least one camera control configuration is non-zero.""" return any(not math.isclose(value, 0.0) for value in configs) @@ -449,7 +471,7 @@ async def execute_video_effect( image_1: torch.Tensor, image_2: torch.Tensor | None = None, model_mode: KlingVideoGenMode | None = None, -) -> tuple[VideoFromFile, str, str]: +) -> tuple[InputImpl.VideoFromFile, str, str]: if dual_character: request_input_field = KlingDualCharacterEffectInput( model_name=model_name, @@ -736,6 +758,386 @@ class KlingTextToVideoNode(IO.ComfyNode): ) +class OmniProTextToVideoNode(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingOmniProTextToVideoNode", + display_name="Kling Omni Text to Video (Pro)", + category="api node/video/Kling", + description="Use text prompts to generate videos with the latest Kling model.", + inputs=[ + IO.Combo.Input("model_name", options=["kling-video-o1"]), + IO.String.Input( + "prompt", + multiline=True, + tooltip="A text prompt describing the video content. " + "This can include both positive and negative descriptions.", + ), + IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]), + IO.Combo.Input("duration", options=[5, 10]), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model_name: str, + prompt: str, + aspect_ratio: str, + duration: int, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=2500) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), + response_model=TaskStatusVideoResponse, + data=OmniProText2VideoRequest( + model_name=model_name, + prompt=prompt, + aspect_ratio=aspect_ratio, + duration=str(duration), + ), + ) + return await finish_omni_video_task(cls, response) + + +class OmniProFirstLastFrameNode(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingOmniProFirstLastFrameNode", + display_name="Kling Omni First-Last-Frame to Video (Pro)", + category="api node/video/Kling", + description="Use a start frame, an optional end frame, or reference images with the latest Kling model.", + inputs=[ + IO.Combo.Input("model_name", options=["kling-video-o1"]), + IO.String.Input( + "prompt", + multiline=True, + tooltip="A text prompt describing the video content. " + "This can include both positive and negative descriptions.", + ), + IO.Combo.Input("duration", options=["5", "10"]), + IO.Image.Input("first_frame"), + IO.Image.Input( + "end_frame", + optional=True, + tooltip="An optional end frame for the video. " + "This cannot be used simultaneously with 'reference_images'.", + ), + IO.Image.Input( + "reference_images", + optional=True, + tooltip="Up to 6 additional reference images.", + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model_name: str, + prompt: str, + duration: int, + first_frame: Input.Image, + end_frame: Input.Image | None = None, + reference_images: Input.Image | None = None, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=2500) + if end_frame is not None and reference_images is not None: + raise ValueError("The 'end_frame' input cannot be used simultaneously with 'reference_images'.") + validate_image_dimensions(first_frame, min_width=300, min_height=300) + validate_image_aspect_ratio(first_frame, (1, 2.5), (2.5, 1)) + image_list: list[OmniParamImage] = [ + OmniParamImage( + image_url=(await upload_images_to_comfyapi(cls, first_frame, wait_label="Uploading first frame"))[0], + type="first_frame", + ) + ] + if end_frame is not None: + validate_image_dimensions(end_frame, min_width=300, min_height=300) + validate_image_aspect_ratio(end_frame, (1, 2.5), (2.5, 1)) + image_list.append( + OmniParamImage( + image_url=(await upload_images_to_comfyapi(cls, end_frame, wait_label="Uploading end frame"))[0], + type="end_frame", + ) + ) + if reference_images is not None: + if get_number_of_images(reference_images) > 6: + raise ValueError("The maximum number of reference images allowed is 6.") + for i in reference_images: + validate_image_dimensions(i, min_width=300, min_height=300) + validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1)) + for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference frame(s)"): + image_list.append(OmniParamImage(image_url=i)) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), + response_model=TaskStatusVideoResponse, + data=OmniProFirstLastFrameRequest( + model_name=model_name, + prompt=prompt, + duration=str(duration), + image_list=image_list, + ), + ) + return await finish_omni_video_task(cls, response) + + +class OmniProImageToVideoNode(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingOmniProImageToVideoNode", + display_name="Kling Omni Image to Video (Pro)", + category="api node/video/Kling", + description="Use up to 7 reference images to generate a video with the latest Kling model.", + inputs=[ + IO.Combo.Input("model_name", options=["kling-video-o1"]), + IO.String.Input( + "prompt", + multiline=True, + tooltip="A text prompt describing the video content. " + "This can include both positive and negative descriptions.", + ), + IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]), + IO.Int.Input("duration", default=3, min=3, max=10, display_mode=IO.NumberDisplay.slider), + IO.Image.Input( + "reference_images", + tooltip="Up to 7 reference images.", + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model_name: str, + prompt: str, + aspect_ratio: str, + duration: int, + reference_images: Input.Image, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=2500) + if get_number_of_images(reference_images) > 7: + raise ValueError("The maximum number of reference images is 7.") + for i in reference_images: + validate_image_dimensions(i, min_width=300, min_height=300) + validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1)) + image_list: list[OmniParamImage] = [] + for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image"): + image_list.append(OmniParamImage(image_url=i)) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), + response_model=TaskStatusVideoResponse, + data=OmniProReferences2VideoRequest( + model_name=model_name, + prompt=prompt, + aspect_ratio=aspect_ratio, + duration=str(duration), + image_list=image_list, + ), + ) + return await finish_omni_video_task(cls, response) + + +class OmniProVideoToVideoNode(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingOmniProVideoToVideoNode", + display_name="Kling Omni Video to Video (Pro)", + category="api node/video/Kling", + description="Use a video and up to 4 reference images to generate a video with the latest Kling model.", + inputs=[ + IO.Combo.Input("model_name", options=["kling-video-o1"]), + IO.String.Input( + "prompt", + multiline=True, + tooltip="A text prompt describing the video content. " + "This can include both positive and negative descriptions.", + ), + IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]), + IO.Int.Input("duration", default=3, min=3, max=10, display_mode=IO.NumberDisplay.slider), + IO.Video.Input("reference_video", tooltip="Video to use as a reference."), + IO.Boolean.Input("keep_original_sound", default=True), + IO.Image.Input( + "reference_images", + tooltip="Up to 4 additional reference images.", + optional=True, + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model_name: str, + prompt: str, + aspect_ratio: str, + duration: int, + reference_video: Input.Video, + keep_original_sound: bool, + reference_images: Input.Image | None = None, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=2500) + validate_video_duration(reference_video, min_duration=3.0, max_duration=10.05) + validate_video_dimensions(reference_video, min_width=720, min_height=720, max_width=2160, max_height=2160) + image_list: list[OmniParamImage] = [] + if reference_images is not None: + if get_number_of_images(reference_images) > 4: + raise ValueError("The maximum number of reference images allowed with a video input is 4.") + for i in reference_images: + validate_image_dimensions(i, min_width=300, min_height=300) + validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1)) + for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image"): + image_list.append(OmniParamImage(image_url=i)) + video_list = [ + OmniParamVideo( + video_url=await upload_video_to_comfyapi(cls, reference_video, wait_label="Uploading reference video"), + refer_type="feature", + keep_original_sound="yes" if keep_original_sound else "no", + ) + ] + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), + response_model=TaskStatusVideoResponse, + data=OmniProReferences2VideoRequest( + model_name=model_name, + prompt=prompt, + aspect_ratio=aspect_ratio, + duration=str(duration), + image_list=image_list if image_list else None, + video_list=video_list, + ), + ) + return await finish_omni_video_task(cls, response) + + +class OmniProEditVideoNode(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingOmniProEditVideoNode", + display_name="Kling Omni Edit Video (Pro)", + category="api node/video/Kling", + description="Edit an existing video with the latest model from Kling.", + inputs=[ + IO.Combo.Input("model_name", options=["kling-video-o1"]), + IO.String.Input( + "prompt", + multiline=True, + tooltip="A text prompt describing the video content. " + "This can include both positive and negative descriptions.", + ), + IO.Video.Input("video", tooltip="Video for editing. The output video length will be the same."), + IO.Boolean.Input("keep_original_sound", default=True), + IO.Image.Input( + "reference_images", + tooltip="Up to 4 additional reference images.", + optional=True, + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model_name: str, + prompt: str, + video: Input.Video, + keep_original_sound: bool, + reference_images: Input.Image | None = None, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=2500) + validate_video_duration(video, min_duration=3.0, max_duration=10.05) + validate_video_dimensions(video, min_width=720, min_height=720, max_width=2160, max_height=2160) + image_list: list[OmniParamImage] = [] + if reference_images is not None: + if get_number_of_images(reference_images) > 4: + raise ValueError("The maximum number of reference images allowed with a video input is 4.") + for i in reference_images: + validate_image_dimensions(i, min_width=300, min_height=300) + validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1)) + for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image"): + image_list.append(OmniParamImage(image_url=i)) + video_list = [ + OmniParamVideo( + video_url=await upload_video_to_comfyapi(cls, video, wait_label="Uploading base video"), + refer_type="base", + keep_original_sound="yes" if keep_original_sound else "no", + ) + ] + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), + response_model=TaskStatusVideoResponse, + data=OmniProReferences2VideoRequest( + model_name=model_name, + prompt=prompt, + aspect_ratio=None, + duration=None, + image_list=image_list if image_list else None, + video_list=video_list, + ), + ) + return await finish_omni_video_task(cls, response) + + class KlingCameraControlT2VNode(IO.ComfyNode): """ Kling Text to Video Camera Control Node. This node is a text to video node, but it supports controlling the camera. @@ -1162,7 +1564,10 @@ class KlingSingleImageVideoEffectNode(IO.ComfyNode): category="api node/video/Kling", description="Achieve different special effects when generating a video based on the effect_scene.", inputs=[ - IO.Image.Input("image", tooltip=" Reference Image. URL or Base64 encoded string (without data:image prefix). File size cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1"), + IO.Image.Input( + "image", + tooltip=" Reference Image. URL or Base64 encoded string (without data:image prefix). File size cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1", + ), IO.Combo.Input( "effect_scene", options=[i.value for i in KlingSingleImageEffectsScene], @@ -1525,6 +1930,11 @@ class KlingExtension(ComfyExtension): KlingImageGenerationNode, KlingSingleImageVideoEffectNode, KlingDualCharacterVideoEffectNode, + OmniProTextToVideoNode, + OmniProFirstLastFrameNode, + OmniProImageToVideoNode, + OmniProVideoToVideoNode, + OmniProEditVideoNode, ] diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py index b9019841f..0532bea9a 100644 --- a/comfy_api_nodes/util/upload_helpers.py +++ b/comfy_api_nodes/util/upload_helpers.py @@ -103,6 +103,7 @@ async def upload_video_to_comfyapi( container: VideoContainer = VideoContainer.MP4, codec: VideoCodec = VideoCodec.H264, max_duration: Optional[int] = None, + wait_label: str | None = "Uploading", ) -> str: """ Uploads a single video to ComfyUI API and returns its download URL. @@ -127,7 +128,7 @@ async def upload_video_to_comfyapi( video.save_to(video_bytes_io, format=container, codec=codec) video_bytes_io.seek(0) - return await upload_file_to_comfyapi(cls, video_bytes_io, filename, upload_mime_type) + return await upload_file_to_comfyapi(cls, video_bytes_io, filename, upload_mime_type, wait_label) async def upload_file_to_comfyapi( From 30c259cac8c08ff8d015f9aff3151cb525c9b702 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 1 Dec 2025 20:25:35 -0500 Subject: [PATCH 661/759] ComfyUI version v0.3.76 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index fa4b4f4b0..4b039356e 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.75" +__version__ = "0.3.76" diff --git a/pyproject.toml b/pyproject.toml index 9009e65fe..02b94a0ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.75" +version = "0.3.76" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 878db3a727c1c6049bc1c4959cdfabc35eaf3d56 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 1 Dec 2025 17:56:17 -0800 Subject: [PATCH 662/759] Implement the Ovis image model. (#11030) --- comfy/ldm/chroma/model.py | 3 +- comfy/ldm/flux/layers.py | 68 +++++++++++++++++++++-------------- comfy/ldm/flux/model.py | 21 ++++++++--- comfy/model_detection.py | 10 +++++- comfy/sd.py | 13 +++++-- comfy/text_encoders/llama.py | 31 ++++++++++++++++ comfy/text_encoders/ovis.py | 69 ++++++++++++++++++++++++++++++++++++ nodes.py | 2 +- 8 files changed, 182 insertions(+), 35 deletions(-) create mode 100644 comfy/text_encoders/ovis.py diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py index a72f8cc47..2e8ef0687 100644 --- a/comfy/ldm/chroma/model.py +++ b/comfy/ldm/chroma/model.py @@ -40,7 +40,8 @@ class ChromaParams: out_dim: int hidden_dim: int n_layers: int - + txt_ids_dims: list + vec_in_dim: int diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py index 2472ab79c..60f2bdae2 100644 --- a/comfy/ldm/flux/layers.py +++ b/comfy/ldm/flux/layers.py @@ -57,6 +57,35 @@ class MLPEmbedder(nn.Module): def forward(self, x: Tensor) -> Tensor: return self.out_layer(self.silu(self.in_layer(x))) +class YakMLP(nn.Module): + def __init__(self, hidden_size: int, intermediate_size: int, dtype=None, device=None, operations=None): + super().__init__() + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.gate_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=True, dtype=dtype, device=device) + self.up_proj = operations.Linear(self.hidden_size, self.intermediate_size, bias=True, dtype=dtype, device=device) + self.down_proj = operations.Linear(self.intermediate_size, self.hidden_size, bias=True, dtype=dtype, device=device) + self.act_fn = nn.SiLU() + + def forward(self, x: Tensor) -> Tensor: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + +def build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=False, yak_mlp=False, dtype=None, device=None, operations=None): + if yak_mlp: + return YakMLP(hidden_size, mlp_hidden_dim, dtype=dtype, device=device, operations=operations) + if mlp_silu_act: + return nn.Sequential( + operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device), + SiLUActivation(), + operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device), + ) + else: + return nn.Sequential( + operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), + nn.GELU(approximate="tanh"), + operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), + ) class RMSNorm(torch.nn.Module): def __init__(self, dim: int, dtype=None, device=None, operations=None): @@ -140,7 +169,7 @@ class SiLUActivation(nn.Module): class DoubleStreamBlock(nn.Module): - def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, dtype=None, device=None, operations=None): + def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, yak_mlp=False, dtype=None, device=None, operations=None): super().__init__() mlp_hidden_dim = int(hidden_size * mlp_ratio) @@ -156,18 +185,7 @@ class DoubleStreamBlock(nn.Module): self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - if mlp_silu_act: - self.img_mlp = nn.Sequential( - operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device), - SiLUActivation(), - operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device), - ) - else: - self.img_mlp = nn.Sequential( - operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), - nn.GELU(approximate="tanh"), - operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), - ) + self.img_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations) if self.modulation: self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations) @@ -177,18 +195,7 @@ class DoubleStreamBlock(nn.Module): self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - if mlp_silu_act: - self.txt_mlp = nn.Sequential( - operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device), - SiLUActivation(), - operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device), - ) - else: - self.txt_mlp = nn.Sequential( - operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), - nn.GELU(approximate="tanh"), - operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), - ) + self.txt_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations) self.flipped_img_txt = flipped_img_txt @@ -275,6 +282,7 @@ class SingleStreamBlock(nn.Module): modulation=True, mlp_silu_act=False, bias=True, + yak_mlp=False, dtype=None, device=None, operations=None @@ -288,12 +296,17 @@ class SingleStreamBlock(nn.Module): self.mlp_hidden_dim = int(hidden_size * mlp_ratio) self.mlp_hidden_dim_first = self.mlp_hidden_dim + self.yak_mlp = yak_mlp if mlp_silu_act: self.mlp_hidden_dim_first = int(hidden_size * mlp_ratio * 2) self.mlp_act = SiLUActivation() else: self.mlp_act = nn.GELU(approximate="tanh") + if self.yak_mlp: + self.mlp_hidden_dim_first *= 2 + self.mlp_act = nn.SiLU() + # qkv and mlp_in self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim_first, bias=bias, dtype=dtype, device=device) # proj and mlp_out @@ -325,7 +338,10 @@ class SingleStreamBlock(nn.Module): attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options) del q, k, v # compute activation in mlp stream, cat again and run second linear layer - mlp = self.mlp_act(mlp) + if self.yak_mlp: + mlp = self.mlp_act(mlp[..., self.mlp_hidden_dim_first // 2:]) * mlp[..., :self.mlp_hidden_dim_first // 2] + else: + mlp = self.mlp_act(mlp) output = self.linear2(torch.cat((attn, mlp), 2)) x += apply_mod(output, mod.gate, None, modulation_dims) if x.dtype == torch.float16: diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index d5674dea6..f40c2a7a9 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -15,7 +15,8 @@ from .layers import ( MLPEmbedder, SingleStreamBlock, timestep_embedding, - Modulation + Modulation, + RMSNorm ) @dataclass @@ -34,11 +35,14 @@ class FluxParams: patch_size: int qkv_bias: bool guidance_embed: bool + txt_ids_dims: list global_modulation: bool = False mlp_silu_act: bool = False ops_bias: bool = True default_ref_method: str = "offset" ref_index_scale: float = 1.0 + yak_mlp: bool = False + txt_norm: bool = False class Flux(nn.Module): @@ -76,6 +80,11 @@ class Flux(nn.Module): ) self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device) + if params.txt_norm: + self.txt_norm = RMSNorm(params.context_in_dim, dtype=dtype, device=device, operations=operations) + else: + self.txt_norm = None + self.double_blocks = nn.ModuleList( [ DoubleStreamBlock( @@ -86,6 +95,7 @@ class Flux(nn.Module): modulation=params.global_modulation is False, mlp_silu_act=params.mlp_silu_act, proj_bias=params.ops_bias, + yak_mlp=params.yak_mlp, dtype=dtype, device=device, operations=operations ) for _ in range(params.depth) @@ -94,7 +104,7 @@ class Flux(nn.Module): self.single_blocks = nn.ModuleList( [ - SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=params.global_modulation is False, mlp_silu_act=params.mlp_silu_act, bias=params.ops_bias, dtype=dtype, device=device, operations=operations) + SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=params.global_modulation is False, mlp_silu_act=params.mlp_silu_act, bias=params.ops_bias, yak_mlp=params.yak_mlp, dtype=dtype, device=device, operations=operations) for _ in range(params.depth_single_blocks) ] ) @@ -150,6 +160,8 @@ class Flux(nn.Module): y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype) vec = vec + self.vector_in(y[:, :self.params.vec_in_dim]) + if self.txt_norm is not None: + txt = self.txt_norm(txt) txt = self.txt_in(txt) vec_orig = vec @@ -332,8 +344,9 @@ class Flux(nn.Module): txt_ids = torch.zeros((bs, context.shape[1], len(self.params.axes_dim)), device=x.device, dtype=torch.float32) - if len(self.params.axes_dim) == 4: # Flux 2 - txt_ids[:, :, 3] = torch.linspace(0, context.shape[1] - 1, steps=context.shape[1], device=x.device, dtype=torch.float32) + if len(self.params.txt_ids_dims) > 0: + for i in self.params.txt_ids_dims: + txt_ids[:, :, i] = torch.linspace(0, context.shape[1] - 1, steps=context.shape[1], device=x.device, dtype=torch.float32) out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None)) out = out[:, :img_tokens] diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 7afe4a798..7d0517e61 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -208,12 +208,12 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["theta"] = 2000 dit_config["out_channels"] = 128 dit_config["global_modulation"] = True - dit_config["vec_in_dim"] = None dit_config["mlp_silu_act"] = True dit_config["qkv_bias"] = False dit_config["ops_bias"] = False dit_config["default_ref_method"] = "index" dit_config["ref_index_scale"] = 10.0 + dit_config["txt_ids_dims"] = [3] patch_size = 1 else: dit_config["image_model"] = "flux" @@ -223,6 +223,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["theta"] = 10000 dit_config["out_channels"] = 16 dit_config["qkv_bias"] = True + dit_config["txt_ids_dims"] = [] patch_size = 2 dit_config["in_channels"] = 16 @@ -245,6 +246,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): vec_in_key = '{}vector_in.in_layer.weight'.format(key_prefix) if vec_in_key in state_dict_keys: dit_config["vec_in_dim"] = state_dict[vec_in_key].shape[1] + else: + dit_config["vec_in_dim"] = None dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.') dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.') @@ -270,6 +273,11 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["nerf_embedder_dtype"] = torch.float32 else: dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys + dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys + dit_config["txt_norm"] = "{}txt_norm.scale".format(key_prefix) in state_dict_keys + if dit_config["yak_mlp"] and dit_config["txt_norm"]: # Ovis model + dit_config["txt_ids_dims"] = [1, 2] + return dit_config if '{}t5_yproj.weight'.format(key_prefix) in state_dict_keys: #Genmo mochi preview diff --git a/comfy/sd.py b/comfy/sd.py index 9eeb0c45a..f9e5efab5 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -53,6 +53,7 @@ import comfy.text_encoders.omnigen2 import comfy.text_encoders.qwen_image import comfy.text_encoders.hunyuan_image import comfy.text_encoders.z_image +import comfy.text_encoders.ovis import comfy.model_patcher import comfy.lora @@ -956,6 +957,7 @@ class CLIPType(Enum): QWEN_IMAGE = 18 HUNYUAN_IMAGE = 19 HUNYUAN_VIDEO_15 = 20 + OVIS = 21 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}): @@ -987,6 +989,7 @@ class TEModel(Enum): MISTRAL3_24B = 14 MISTRAL3_24B_PRUNED_FLUX2 = 15 QWEN3_4B = 16 + QWEN3_2B = 17 def detect_te_model(sd): @@ -1020,9 +1023,12 @@ def detect_te_model(sd): if weight.shape[0] == 512: return TEModel.QWEN25_7B if "model.layers.0.post_attention_layernorm.weight" in sd: - if 'model.layers.0.self_attn.q_norm.weight' in sd: - return TEModel.QWEN3_4B weight = sd['model.layers.0.post_attention_layernorm.weight'] + if 'model.layers.0.self_attn.q_norm.weight' in sd: + if weight.shape[0] == 2560: + return TEModel.QWEN3_4B + elif weight.shape[0] == 2048: + return TEModel.QWEN3_2B if weight.shape[0] == 5120: if "model.layers.39.post_attention_layernorm.weight" in sd: return TEModel.MISTRAL3_24B @@ -1150,6 +1156,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip elif te_model == TEModel.QWEN3_4B: clip_target.clip = comfy.text_encoders.z_image.te(**llama_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.z_image.ZImageTokenizer + elif te_model == TEModel.QWEN3_2B: + clip_target.clip = comfy.text_encoders.ovis.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.ovis.OvisTokenizer else: # clip_l if clip_type == CLIPType.SD3: diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index cd4b5f76c..0d07ac8c6 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -100,6 +100,28 @@ class Qwen3_4BConfig: rope_scale = None final_norm: bool = True +@dataclass +class Ovis25_2BConfig: + vocab_size: int = 151936 + hidden_size: int = 2048 + intermediate_size: int = 6144 + num_hidden_layers: int = 28 + num_attention_heads: int = 16 + num_key_value_heads: int = 8 + max_position_embeddings: int = 40960 + rms_norm_eps: float = 1e-6 + rope_theta: float = 1000000.0 + transformer_type: str = "llama" + head_dim = 128 + rms_norm_add = False + mlp_activation = "silu" + qkv_bias = False + rope_dims = None + q_norm = "gemma3" + k_norm = "gemma3" + rope_scale = None + final_norm: bool = True + @dataclass class Qwen25_7BVLI_Config: vocab_size: int = 152064 @@ -542,6 +564,15 @@ class Qwen3_4B(BaseLlama, torch.nn.Module): self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) self.dtype = dtype +class Ovis25_2B(BaseLlama, torch.nn.Module): + def __init__(self, config_dict, dtype, device, operations): + super().__init__() + config = Ovis25_2BConfig(**config_dict) + self.num_layers = config.num_hidden_layers + + self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) + self.dtype = dtype + class Qwen25_7BVLI(BaseLlama, torch.nn.Module): def __init__(self, config_dict, dtype, device, operations): super().__init__() diff --git a/comfy/text_encoders/ovis.py b/comfy/text_encoders/ovis.py new file mode 100644 index 000000000..81c9bd51c --- /dev/null +++ b/comfy/text_encoders/ovis.py @@ -0,0 +1,69 @@ +from transformers import Qwen2Tokenizer +import comfy.text_encoders.llama +from comfy import sd1_clip +import os +import torch +import numbers + +class Qwen3Tokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer") + super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='qwen3_2b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=284, pad_token=151643, tokenizer_data=tokenizer_data) + + +class OvisTokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_2b", tokenizer=Qwen3Tokenizer) + self.llama_template = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background: {}<|im_end|>\n<|im_start|>assistant\n\n\n\n\n" + + def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs): + if llama_template is None: + llama_text = self.llama_template.format(text) + else: + llama_text = llama_template.format(text) + + tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs) + return tokens + +class Ovis25_2BModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}): + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Ovis25_2B, enable_attention_masks=attention_mask, return_attention_masks=False, zero_out_masked=True, model_options=model_options) + + +class OvisTEModel(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="qwen3_2b", clip_model=Ovis25_2BModel, model_options=model_options) + + def encode_token_weights(self, token_weight_pairs, template_end=-1): + out, pooled = super().encode_token_weights(token_weight_pairs) + tok_pairs = token_weight_pairs["qwen3_2b"][0] + count_im_start = 0 + if template_end == -1: + for i, v in enumerate(tok_pairs): + elem = v[0] + if not torch.is_tensor(elem): + if isinstance(elem, numbers.Integral): + if elem == 4004 and count_im_start < 1: + template_end = i + count_im_start += 1 + + if out.shape[1] > (template_end + 1): + if tok_pairs[template_end + 1][0] == 25: + template_end += 1 + + out = out[:, template_end:] + return out, pooled, {} + + +def te(dtype_llama=None, llama_scaled_fp8=None, llama_quantization_metadata=None): + class OvisTEModel_(OvisTEModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: + model_options = model_options.copy() + model_options["scaled_fp8"] = llama_scaled_fp8 + if dtype_llama is not None: + dtype = dtype_llama + if llama_quantization_metadata is not None: + model_options["quantization_metadata"] = llama_quantization_metadata + super().__init__(device=device, dtype=dtype, model_options=model_options) + return OvisTEModel_ diff --git a/nodes.py b/nodes.py index 495dec806..d5e5dc228 100644 --- a/nodes.py +++ b/nodes.py @@ -939,7 +939,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), From c55dc857d5da5af203caf720ed7056047d382544 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Mon, 1 Dec 2025 17:56:38 -0800 Subject: [PATCH 663/759] bump comfyui-frontend-package to 1.33.10 (#11028) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 045b2ac54..f98848e20 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.32.10 +comfyui-frontend-package==1.33.10 comfyui-workflow-templates==0.7.25 comfyui-embedded-docs==0.3.1 torch From b4a20acc54b0b94dc05a1bd09dc0b54dd12203f1 Mon Sep 17 00:00:00 2001 From: "Dr.Lt.Data" <128333288+ltdrdata@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:32:52 +0900 Subject: [PATCH 664/759] feat: Support ComfyUI-Manager for pip version (#7555) --- comfy/cli_args.py | 7 +++++++ comfy_api/feature_flags.py | 1 + main.py | 30 ++++++++++++++++++++++++++++++ manager_requirements.txt | 1 + nodes.py | 9 +++++++++ server.py | 8 +++++++- 6 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 manager_requirements.txt diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 5f0dfaa10..209fc185b 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -121,6 +121,12 @@ upcast.add_argument("--force-upcast-attention", action="store_true", help="Force upcast.add_argument("--dont-upcast-attention", action="store_true", help="Disable all upcasting of attention. Should be unnecessary except for debugging.") +parser.add_argument("--enable-manager", action="store_true", help="Enable the ComfyUI-Manager feature.") +manager_group = parser.add_mutually_exclusive_group() +manager_group.add_argument("--disable-manager-ui", action="store_true", help="Disables only the ComfyUI-Manager UI and endpoints. Scheduled installations and similar background tasks will still operate.") +manager_group.add_argument("--enable-manager-legacy-ui", action="store_true", help="Enables the legacy UI of ComfyUI-Manager") + + vram_group = parser.add_mutually_exclusive_group() vram_group.add_argument("--gpu-only", action="store_true", help="Store and run everything (text encoders/CLIP models, etc... on the GPU).") vram_group.add_argument("--highvram", action="store_true", help="By default models will be unloaded to CPU memory after being used. This option keeps them in GPU memory.") @@ -168,6 +174,7 @@ parser.add_argument("--multi-user", action="store_true", help="Enables per-user parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level') parser.add_argument("--log-stdout", action="store_true", help="Send normal process output to stdout instead of stderr (default).") + # The default built-in provider hosted under web/ DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest" diff --git a/comfy_api/feature_flags.py b/comfy_api/feature_flags.py index 0d4389a6e..bfb77eb5f 100644 --- a/comfy_api/feature_flags.py +++ b/comfy_api/feature_flags.py @@ -13,6 +13,7 @@ from comfy.cli_args import args SERVER_FEATURE_FLAGS: Dict[str, Any] = { "supports_preview_metadata": True, "max_upload_size": args.max_upload_size * 1024 * 1024, # Convert MB to bytes + "extension": {"manager": {"supports_v4": True}}, } diff --git a/main.py b/main.py index e1b0f1620..0cd815d9e 100644 --- a/main.py +++ b/main.py @@ -15,6 +15,7 @@ from comfy_execution.progress import get_progress_state from comfy_execution.utils import get_executing_context from comfy_api import feature_flags + if __name__ == "__main__": #NOTE: These do not do anything on core ComfyUI, they are for custom nodes. os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1' @@ -22,6 +23,23 @@ if __name__ == "__main__": setup_logger(log_level=args.verbose, use_stdout=args.log_stdout) + +def handle_comfyui_manager_unavailable(): + if not args.windows_standalone_build: + logging.warning(f"\n\nYou appear to be running comfyui-manager from source, this is not recommended. Please install comfyui-manager using the following command:\ncommand:\n\t{sys.executable} -m pip install --pre comfyui_manager\n") + args.enable_manager = False + + +if args.enable_manager: + if importlib.util.find_spec("comfyui_manager"): + import comfyui_manager + + if not comfyui_manager.__file__ or not comfyui_manager.__file__.endswith('__init__.py'): + handle_comfyui_manager_unavailable() + else: + handle_comfyui_manager_unavailable() + + def apply_custom_paths(): # extra model paths extra_model_paths_config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extra_model_paths.yaml") @@ -79,6 +97,11 @@ def execute_prestartup_script(): for possible_module in possible_modules: module_path = os.path.join(custom_node_path, possible_module) + + if args.enable_manager: + if comfyui_manager.should_be_disabled(module_path): + continue + if os.path.isfile(module_path) or module_path.endswith(".disabled") or module_path == "__pycache__": continue @@ -101,6 +124,10 @@ def execute_prestartup_script(): logging.info("") apply_custom_paths() + +if args.enable_manager: + comfyui_manager.prestartup() + execute_prestartup_script() @@ -323,6 +350,9 @@ def start_comfyui(asyncio_loop=None): asyncio.set_event_loop(asyncio_loop) prompt_server = server.PromptServer(asyncio_loop) + if args.enable_manager and not args.disable_manager_ui: + comfyui_manager.start() + hook_breaker_ac10a0.save_functions() asyncio_loop.run_until_complete(nodes.init_extra_nodes( init_custom_nodes=(not args.disable_all_custom_nodes) or len(args.whitelist_custom_nodes) > 0, diff --git a/manager_requirements.txt b/manager_requirements.txt new file mode 100644 index 000000000..52cc5389c --- /dev/null +++ b/manager_requirements.txt @@ -0,0 +1 @@ +comfyui_manager==4.0.3b3 diff --git a/nodes.py b/nodes.py index d5e5dc228..4c910a34b 100644 --- a/nodes.py +++ b/nodes.py @@ -43,6 +43,9 @@ import folder_paths import latent_preview import node_helpers +if args.enable_manager: + import comfyui_manager + def before_node_execution(): comfy.model_management.throw_exception_if_processing_interrupted() @@ -2243,6 +2246,12 @@ async def init_external_custom_nodes(): if args.disable_all_custom_nodes and possible_module not in args.whitelist_custom_nodes: logging.info(f"Skipping {possible_module} due to disable_all_custom_nodes and whitelist_custom_nodes") continue + + if args.enable_manager: + if comfyui_manager.should_be_disabled(module_path): + logging.info(f"Blocked by policy: {module_path}") + continue + time_before = time.perf_counter() success = await load_custom_node(module_path, base_node_names, module_parent="custom_nodes") node_import_times.append((time.perf_counter() - time_before, module_path, success)) diff --git a/server.py b/server.py index fca5050bd..e3bd056d9 100644 --- a/server.py +++ b/server.py @@ -44,6 +44,9 @@ from protocol import BinaryEventTypes # Import cache control middleware from middleware.cache_middleware import cache_control +if args.enable_manager: + import comfyui_manager + async def send_socket_catch_exception(function, message): try: await function(message) @@ -212,6 +215,9 @@ class PromptServer(): if args.disable_api_nodes: middlewares.append(create_block_external_middleware()) + if args.enable_manager: + middlewares.append(comfyui_manager.create_middleware()) + max_upload_size = round(args.max_upload_size * 1024 * 1024) self.app = web.Application(client_max_size=max_upload_size, middlewares=middlewares) self.sockets = dict() @@ -599,7 +605,7 @@ class PromptServer(): system_stats = { "system": { - "os": os.name, + "os": sys.platform, "ram_total": ram_total, "ram_free": ram_free, "comfyui_version": __version__, From a17cf1c3871ad582c85c2bb6fddb63ec9c6df0ce Mon Sep 17 00:00:00 2001 From: Yoland Yan <4950057+yoland68@users.noreply.github.com> Date: Mon, 1 Dec 2025 19:40:44 -0800 Subject: [PATCH 665/759] Add @guill as a code owner (#11031) --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS b/CODEOWNERS index b7aca9b26..51acc4986 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,3 +1,4 @@ # Admins * @comfyanonymous * @kosinkadink +* @guill From 44baa0b7f32dd0c2ff0a9898aeb6c7929d855cd3 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Tue, 2 Dec 2025 11:46:29 -0800 Subject: [PATCH 666/759] Fix CODEOWNERS formatting to have all on the same line, otherwise only last line applies (#11053) --- CODEOWNERS | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index 51acc4986..4d5448636 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,4 +1,2 @@ # Admins -* @comfyanonymous -* @kosinkadink -* @guill +* @comfyanonymous @kosinkadink @guill From 33d6aec3b70bc6f3e5bba26c85bd8f3bb1380d08 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 2 Dec 2025 21:50:13 +0200 Subject: [PATCH 667/759] add check for the format arg type in VideoFromComponents.save_to function (#11046) * add check for the format var type in VideoFromComponents.save_to function * convert "format" to VideoContainer enum --- comfy_api/latest/_input_impl/video_types.py | 2 +- comfy_extras/nodes_video.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py index 7231bf13c..a4cd3737d 100644 --- a/comfy_api/latest/_input_impl/video_types.py +++ b/comfy_api/latest/_input_impl/video_types.py @@ -337,7 +337,7 @@ class VideoFromComponents(VideoInput): if codec != VideoCodec.AUTO and codec != VideoCodec.H264: raise ValueError("Only H264 codec is supported for now") extra_kwargs = {} - if format != VideoContainer.AUTO: + if isinstance(format, VideoContainer) and format != VideoContainer.AUTO: extra_kwargs["format"] = format.value with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}, **extra_kwargs) as output: # Add metadata before writing any streams diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py index 69fabb12e..6cf6e39bf 100644 --- a/comfy_extras/nodes_video.py +++ b/comfy_extras/nodes_video.py @@ -88,7 +88,7 @@ class SaveVideo(io.ComfyNode): ) @classmethod - def execute(cls, video: VideoInput, filename_prefix, format, codec) -> io.NodeOutput: + def execute(cls, video: VideoInput, filename_prefix, format: str, codec) -> io.NodeOutput: width, height = video.get_dimensions() full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path( filename_prefix, @@ -108,7 +108,7 @@ class SaveVideo(io.ComfyNode): file = f"{filename}_{counter:05}_.{VideoContainer.get_extension(format)}" video.save_to( os.path.join(full_output_folder, file), - format=format, + format=VideoContainer(format), codec=codec, metadata=saved_metadata ) From daaceac769a1355ab975758ede064317ea7514b4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 2 Dec 2025 14:11:58 -0800 Subject: [PATCH 668/759] Hack to make zimage work in fp16. (#11057) --- comfy/ldm/lumina/model.py | 18 +++++++++++------- comfy/supported_models.py | 2 ++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index 7d7e9112c..070b5da09 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -22,6 +22,10 @@ def modulate(x, scale): # Core NextDiT Model # ############################################################################# +def clamp_fp16(x): + if x.dtype == torch.float16: + return torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504) + return x class JointAttention(nn.Module): """Multi-head attention module.""" @@ -169,7 +173,7 @@ class FeedForward(nn.Module): # @torch.compile def _forward_silu_gating(self, x1, x3): - return F.silu(x1) * x3 + return clamp_fp16(F.silu(x1) * x3) def forward(self, x): return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x))) @@ -273,27 +277,27 @@ class JointTransformerBlock(nn.Module): scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1) x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2( - self.attention( + clamp_fp16(self.attention( modulate(self.attention_norm1(x), scale_msa), x_mask, freqs_cis, transformer_options=transformer_options, - ) + )) ) x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2( - self.feed_forward( + clamp_fp16(self.feed_forward( modulate(self.ffn_norm1(x), scale_mlp), - ) + )) ) else: assert adaln_input is None x = x + self.attention_norm2( - self.attention( + clamp_fp16(self.attention( self.attention_norm1(x), x_mask, freqs_cis, transformer_options=transformer_options, - ) + )) ) x = x + self.ffn_norm2( self.feed_forward( diff --git a/comfy/supported_models.py b/comfy/supported_models.py index af8120400..afd97160b 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1027,6 +1027,8 @@ class ZImage(Lumina2): memory_usage_factor = 1.7 + supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] + def clip_target(self, state_dict={}): pref = self.text_encoder_key_prefix[0] hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_4b.transformer.".format(pref)) From 277237ccc1499bac7fcd221a666dfe7a32ac4206 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Wed, 3 Dec 2025 08:24:19 +1000 Subject: [PATCH 669/759] attention: use flag based OOM fallback (#11038) Exception ref all local variables for the lifetime of exception context. Just set a flag and then if to dump the exception before falling back. --- comfy/ldm/modules/attention.py | 3 +++ comfy/ldm/modules/diffusionmodules/model.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index 7437e0567..a8800ded0 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -517,6 +517,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha @wrap_attn def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs): + exception_fallback = False if skip_reshape: b, _, _, dim_head = q.shape tensor_layout = "HND" @@ -541,6 +542,8 @@ def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape= out = sageattn(q, k, v, attn_mask=mask, is_causal=False, tensor_layout=tensor_layout) except Exception as e: logging.error("Error running sage attention: {}, using pytorch attention instead.".format(e)) + exception_fallback = True + if exception_fallback: if tensor_layout == "NHD": q, k, v = map( lambda t: t.transpose(1, 2), diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index 4245eedca..de1e01cc8 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -279,6 +279,7 @@ def pytorch_attention(q, k, v): orig_shape = q.shape B = orig_shape[0] C = orig_shape[1] + oom_fallback = False q, k, v = map( lambda t: t.view(B, 1, C, -1).transpose(2, 3).contiguous(), (q, k, v), @@ -289,6 +290,8 @@ def pytorch_attention(q, k, v): out = out.transpose(2, 3).reshape(orig_shape) except model_management.OOM_EXCEPTION: logging.warning("scaled_dot_product_attention OOMed: switched to slice attention") + oom_fallback = True + if oom_fallback: out = slice_attention(q.view(B, -1, C), k.view(B, -1, C).transpose(1, 2), v.view(B, -1, C).transpose(1, 2)).reshape(orig_shape) return out From b94d394a64dd0af06bca44b96c66549bb463331d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 2 Dec 2025 18:38:31 -0800 Subject: [PATCH 670/759] Support Z Image alibaba pai fun controlnets. (#11062) These are not actual controlnets so put it in the models/model_patches folder and use the ModelPatchLoader + QwenImageDiffsynthControlnet node to use it. --- comfy/ldm/lumina/controlnet.py | 113 ++++++++++++++++++++++++++++++ comfy/ldm/lumina/model.py | 24 ++++--- comfy_extras/nodes_model_patch.py | 101 +++++++++++++++++++++++++- 3 files changed, 229 insertions(+), 9 deletions(-) create mode 100644 comfy/ldm/lumina/controlnet.py diff --git a/comfy/ldm/lumina/controlnet.py b/comfy/ldm/lumina/controlnet.py new file mode 100644 index 000000000..fd7ce3b5c --- /dev/null +++ b/comfy/ldm/lumina/controlnet.py @@ -0,0 +1,113 @@ +import torch +from torch import nn + +from .model import JointTransformerBlock + +class ZImageControlTransformerBlock(JointTransformerBlock): + def __init__( + self, + layer_id: int, + dim: int, + n_heads: int, + n_kv_heads: int, + multiple_of: int, + ffn_dim_multiplier: float, + norm_eps: float, + qk_norm: bool, + modulation=True, + block_id=0, + operation_settings=None, + ): + super().__init__(layer_id, dim, n_heads, n_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, qk_norm, modulation, z_image_modulation=True, operation_settings=operation_settings) + self.block_id = block_id + if block_id == 0: + self.before_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.after_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + + def forward(self, c, x, **kwargs): + if self.block_id == 0: + c = self.before_proj(c) + x + c = super().forward(c, **kwargs) + c_skip = self.after_proj(c) + return c_skip, c + +class ZImage_Control(torch.nn.Module): + def __init__( + self, + dim: int = 3840, + n_heads: int = 30, + n_kv_heads: int = 30, + multiple_of: int = 256, + ffn_dim_multiplier: float = (8.0 / 3.0), + norm_eps: float = 1e-5, + qk_norm: bool = True, + dtype=None, + device=None, + operations=None, + **kwargs + ): + super().__init__() + operation_settings = {"operations": operations, "device": device, "dtype": dtype} + + self.additional_in_dim = 0 + self.control_in_dim = 16 + n_refiner_layers = 2 + self.n_control_layers = 6 + self.control_layers = nn.ModuleList( + [ + ZImageControlTransformerBlock( + i, + dim, + n_heads, + n_kv_heads, + multiple_of, + ffn_dim_multiplier, + norm_eps, + qk_norm, + block_id=i, + operation_settings=operation_settings, + ) + for i in range(self.n_control_layers) + ] + ) + + all_x_embedder = {} + patch_size = 2 + f_patch_size = 1 + x_embedder = operations.Linear(f_patch_size * patch_size * patch_size * self.control_in_dim, dim, bias=True, device=device, dtype=dtype) + all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder + + self.control_all_x_embedder = nn.ModuleDict(all_x_embedder) + self.control_noise_refiner = nn.ModuleList( + [ + JointTransformerBlock( + layer_id, + dim, + n_heads, + n_kv_heads, + multiple_of, + ffn_dim_multiplier, + norm_eps, + qk_norm, + modulation=True, + z_image_modulation=True, + operation_settings=operation_settings, + ) + for layer_id in range(n_refiner_layers) + ] + ) + + def forward(self, cap_feats, control_context, x_freqs_cis, adaln_input): + patch_size = 2 + f_patch_size = 1 + pH = pW = patch_size + B, C, H, W = control_context.shape + control_context = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_context.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2)) + + x_attn_mask = None + for layer in self.control_noise_refiner: + control_context = layer(control_context, x_attn_mask, x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input) + return control_context + + def forward_control_block(self, layer_id, control_context, x, x_attn_mask, x_freqs_cis, adaln_input): + return self.control_layers[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index 070b5da09..f1c1a0ec3 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -568,7 +568,7 @@ class NextDiT(nn.Module): ).execute(x, timesteps, context, num_tokens, attention_mask, **kwargs) # def forward(self, x, t, cap_feats, cap_mask): - def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs): + def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, transformer_options={}, **kwargs): t = 1.0 - timesteps cap_feats = context cap_mask = attention_mask @@ -585,16 +585,24 @@ class NextDiT(nn.Module): cap_feats = self.cap_embedder(cap_feats) # (N, L, D) # todo check if able to batchify w.o. redundant compute + patches = transformer_options.get("patches", {}) transformer_options = kwargs.get("transformer_options", {}) x_is_tensor = isinstance(x, torch.Tensor) - x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options) - freqs_cis = freqs_cis.to(x.device) + img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options) + freqs_cis = freqs_cis.to(img.device) - for layer in self.layers: - x = layer(x, mask, freqs_cis, adaln_input, transformer_options=transformer_options) + for i, layer in enumerate(self.layers): + img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options) + if "double_block" in patches: + for p in patches["double_block"]: + out = p({"img": img[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options}) + if "img" in out: + img[:, cap_size[0]:] = out["img"] + if "txt" in out: + img[:, :cap_size[0]] = out["txt"] - x = self.final_layer(x, adaln_input) - x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor)[:,:,:h,:w] + img = self.final_layer(img, adaln_input) + img = self.unpatchify(img, img_size, cap_size, return_tensor=x_is_tensor)[:, :, :h, :w] - return -x + return -img diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index 783c59b6b..c61810dbf 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -6,6 +6,7 @@ import comfy.ops import comfy.model_management import comfy.ldm.common_dit import comfy.latent_formats +import comfy.ldm.lumina.controlnet class BlockWiseControlBlock(torch.nn.Module): @@ -189,6 +190,35 @@ class SigLIPMultiFeatProjModel(torch.nn.Module): return embedding +def z_image_convert(sd): + replace_keys = {".attention.to_out.0.bias": ".attention.out.bias", + ".attention.norm_k.weight": ".attention.k_norm.weight", + ".attention.norm_q.weight": ".attention.q_norm.weight", + ".attention.to_out.0.weight": ".attention.out.weight" + } + + out_sd = {} + for k in sorted(sd.keys()): + w = sd[k] + + k_out = k + if k_out.endswith(".attention.to_k.weight"): + cc = [w] + continue + if k_out.endswith(".attention.to_q.weight"): + cc = [w] + cc + continue + if k_out.endswith(".attention.to_v.weight"): + cc = cc + [w] + w = torch.cat(cc, dim=0) + k_out = k_out.replace(".attention.to_v.weight", ".attention.qkv.weight") + + for r, rr in replace_keys.items(): + k_out = k_out.replace(r, rr) + out_sd[k_out] = w + + return out_sd + class ModelPatchLoader: @classmethod def INPUT_TYPES(s): @@ -211,6 +241,9 @@ class ModelPatchLoader: elif 'feature_embedder.mid_layer_norm.bias' in sd: sd = comfy.utils.state_dict_prefix_replace(sd, {"feature_embedder.": ""}, filter_keys=True) model = SigLIPMultiFeatProjModel(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + elif 'control_all_x_embedder.2-1.weight' in sd: # alipai z image fun controlnet + sd = z_image_convert(sd) + model = comfy.ldm.lumina.controlnet.ZImage_Control(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) model.load_state_dict(sd) model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) @@ -263,6 +296,69 @@ class DiffSynthCnetPatch: def models(self): return [self.model_patch] +class ZImageControlPatch: + def __init__(self, model_patch, vae, image, strength): + self.model_patch = model_patch + self.vae = vae + self.image = image + self.strength = strength + self.encoded_image = self.encode_latent_cond(image) + self.encoded_image_size = (image.shape[1], image.shape[2]) + self.temp_data = None + + def encode_latent_cond(self, image): + latent_image = comfy.latent_formats.Flux().process_in(self.vae.encode(image)) + return latent_image + + def __call__(self, kwargs): + x = kwargs.get("x") + img = kwargs.get("img") + txt = kwargs.get("txt") + pe = kwargs.get("pe") + vec = kwargs.get("vec") + block_index = kwargs.get("block_index") + spacial_compression = self.vae.spacial_compression_encode() + if self.encoded_image is None or self.encoded_image_size != (x.shape[-2] * spacial_compression, x.shape[-1] * spacial_compression): + image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") + loaded_models = comfy.model_management.loaded_models(only_currently_used=True) + self.encoded_image = self.encode_latent_cond(image_scaled.movedim(1, -1)) + self.encoded_image_size = (image_scaled.shape[-2], image_scaled.shape[-1]) + comfy.model_management.load_models_gpu(loaded_models) + + cnet_index = (block_index // 5) + cnet_index_float = (block_index / 5) + + kwargs.pop("img") # we do ops in place + kwargs.pop("txt") + + cnet_blocks = self.model_patch.model.n_control_layers + if cnet_index_float > (cnet_blocks - 1): + self.temp_data = None + return kwargs + + if self.temp_data is None or self.temp_data[0] > cnet_index: + self.temp_data = (-1, (None, self.model_patch.model(txt, self.encoded_image.to(img.dtype), pe, vec))) + + while self.temp_data[0] < cnet_index and (self.temp_data[0] + 1) < cnet_blocks: + next_layer = self.temp_data[0] + 1 + self.temp_data = (next_layer, self.model_patch.model.forward_control_block(next_layer, self.temp_data[1][1], img[:, :self.temp_data[1][1].shape[1]], None, pe, vec)) + + if cnet_index_float == self.temp_data[0]: + img[:, :self.temp_data[1][0].shape[1]] += (self.temp_data[1][0] * self.strength) + if cnet_blocks == self.temp_data[0] + 1: + self.temp_data = None + + return kwargs + + def to(self, device_or_dtype): + if isinstance(device_or_dtype, torch.device): + self.encoded_image = self.encoded_image.to(device_or_dtype) + self.temp_data = None + return self + + def models(self): + return [self.model_patch] + class QwenImageDiffsynthControlnet: @classmethod def INPUT_TYPES(s): @@ -289,7 +385,10 @@ class QwenImageDiffsynthControlnet: mask = mask.unsqueeze(2) mask = 1.0 - mask - model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength, mask)) + if isinstance(model_patch.model, comfy.ldm.lumina.controlnet.ZImage_Control): + model_patched.set_model_double_block_patch(ZImageControlPatch(model_patch, vae, image, strength)) + else: + model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength, mask)) return (model_patched,) From 3f512f5659cfbb3c53999cde6ff557591740252b Mon Sep 17 00:00:00 2001 From: Jim Heising Date: Tue, 2 Dec 2025 19:29:27 -0800 Subject: [PATCH 671/759] Added PATCH method to CORS headers (#11066) Added PATCH http method to access-control-allow-header-methods header because there are now PATCH endpoints exposed in the API. See https://github.com/comfyanonymous/ComfyUI/blob/277237ccc1499bac7fcd221a666dfe7a32ac4206/api_server/routes/internal/internal_routes.py#L34 for an example of an API endpoint that uses the PATCH method. --- server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server.py b/server.py index e3bd056d9..ac4f42222 100644 --- a/server.py +++ b/server.py @@ -98,7 +98,7 @@ def create_cors_middleware(allowed_origin: str): response = await handler(request) response.headers['Access-Control-Allow-Origin'] = allowed_origin - response.headers['Access-Control-Allow-Methods'] = 'POST, GET, DELETE, PUT, OPTIONS' + response.headers['Access-Control-Allow-Methods'] = 'POST, GET, DELETE, PUT, OPTIONS, PATCH' response.headers['Access-Control-Allow-Headers'] = 'Content-Type, Authorization' response.headers['Access-Control-Allow-Credentials'] = 'true' return response From 73f5649196f472d3719e2e7513e0a9d029cc3e38 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Wed, 3 Dec 2025 13:49:29 +1000 Subject: [PATCH 672/759] Implement temporal rolling VAE (Major VRAM reductions in Hunyuan and Kandinsky) (#10995) * hunyuan upsampler: rework imports Remove the transitive import of VideoConv3d and Resnet and takes these from actual implementation source. * model: remove unused give_pre_end According to git grep, this is not used now, and was not used in the initial commit that introduced it (see below). This semantic is difficult to implement temporal roll VAE for (and would defeat the purpose). Rather than implement the complex if, just delete the unused feature. (venv) rattus@rattus-box2:~/ComfyUI$ git log --oneline 220afe33 (HEAD) Initial commit. (venv) rattus@rattus-box2:~/ComfyUI$ git grep give_pre comfy/ldm/modules/diffusionmodules/model.py: resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False, comfy/ldm/modules/diffusionmodules/model.py: self.give_pre_end = give_pre_end comfy/ldm/modules/diffusionmodules/model.py: if self.give_pre_end: (venv) rattus@rattus-box2:~/ComfyUI$ git co origin/master Previous HEAD position was 220afe33 Initial commit. HEAD is now at 9d8a8179 Enable async offloading by default on Nvidia. (#10953) (venv) rattus@rattus-box2:~/ComfyUI$ git grep give_pre comfy/ldm/modules/diffusionmodules/model.py: resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False, comfy/ldm/modules/diffusionmodules/model.py: self.give_pre_end = give_pre_end comfy/ldm/modules/diffusionmodules/model.py: if self.give_pre_end: * move refiner VAE temporal roller to core Move the carrying conv op to the common VAE code and give it a better name. Roll the carry implementation logic for Resnet into the base class and scrap the Hunyuan specific subclass. * model: Add temporal roll to main VAE decoder If there are no attention layers, its a standard resnet and VideoConv3d is asked for, substitute in the temporal rolloing VAE algorithm. This reduces VAE usage by the temporal dimension (can be huge VRAM savings). * model: Add temporal roll to main VAE encoder If there are no attention layers, its a standard resnet and VideoConv3d is asked for, substitute in the temporal rolling VAE algorithm. This reduces VAE usage by the temporal dimension (can be huge VRAM savings). --- comfy/ldm/hunyuan_video/upsampler.py | 3 +- comfy/ldm/hunyuan_video/vae_refiner.py | 94 +++------ comfy/ldm/modules/diffusionmodules/model.py | 207 ++++++++++++++------ 3 files changed, 174 insertions(+), 130 deletions(-) diff --git a/comfy/ldm/hunyuan_video/upsampler.py b/comfy/ldm/hunyuan_video/upsampler.py index 9f5e91a59..85f515f67 100644 --- a/comfy/ldm/hunyuan_video/upsampler.py +++ b/comfy/ldm/hunyuan_video/upsampler.py @@ -1,7 +1,8 @@ import torch import torch.nn as nn import torch.nn.functional as F -from comfy.ldm.hunyuan_video.vae_refiner import RMS_norm, ResnetBlock, VideoConv3d +from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, VideoConv3d +from comfy.ldm.hunyuan_video.vae_refiner import RMS_norm import model_management, model_patcher class SRResidualCausalBlock3D(nn.Module): diff --git a/comfy/ldm/hunyuan_video/vae_refiner.py b/comfy/ldm/hunyuan_video/vae_refiner.py index 9f750dcc4..ddf77cd0e 100644 --- a/comfy/ldm/hunyuan_video/vae_refiner.py +++ b/comfy/ldm/hunyuan_video/vae_refiner.py @@ -1,42 +1,12 @@ import torch import torch.nn as nn import torch.nn.functional as F -from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d, Normalize +from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, CarriedConv3d, Normalize, conv_carry_causal_3d, torch_cat_if_needed import comfy.ops import comfy.ldm.models.autoencoder import comfy.model_management ops = comfy.ops.disable_weight_init -class NoPadConv3d(nn.Module): - def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding=0, **kwargs): - super().__init__() - self.conv = ops.Conv3d(n_channels, out_channels, kernel_size, stride=stride, dilation=dilation, **kwargs) - - def forward(self, x): - return self.conv(x) - - -def conv_carry_causal_3d(xl, op, conv_carry_in=None, conv_carry_out=None): - - x = xl[0] - xl.clear() - - if conv_carry_out is not None: - to_push = x[:, :, -2:, :, :].clone() - conv_carry_out.append(to_push) - - if isinstance(op, NoPadConv3d): - if conv_carry_in is None: - x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2, 0), mode = 'replicate') - else: - carry_len = conv_carry_in[0].shape[2] - x = torch.cat([conv_carry_in.pop(0), x], dim=2) - x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2 - carry_len, 0), mode = 'replicate') - - out = op(x) - - return out - class RMS_norm(nn.Module): def __init__(self, dim): @@ -49,7 +19,7 @@ class RMS_norm(nn.Module): return F.normalize(x, dim=1) * self.scale * comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device) class DnSmpl(nn.Module): - def __init__(self, ic, oc, tds=True, refiner_vae=True, op=VideoConv3d): + def __init__(self, ic, oc, tds, refiner_vae, op): super().__init__() fct = 2 * 2 * 2 if tds else 1 * 2 * 2 assert oc % fct == 0 @@ -109,7 +79,7 @@ class DnSmpl(nn.Module): class UpSmpl(nn.Module): - def __init__(self, ic, oc, tus=True, refiner_vae=True, op=VideoConv3d): + def __init__(self, ic, oc, tus, refiner_vae, op): super().__init__() fct = 2 * 2 * 2 if tus else 1 * 2 * 2 self.conv = op(ic, oc * fct, kernel_size=3, stride=1, padding=1) @@ -163,23 +133,6 @@ class UpSmpl(nn.Module): return h + x -class HunyuanRefinerResnetBlock(ResnetBlock): - def __init__(self, in_channels, out_channels, conv_op=NoPadConv3d, norm_op=RMS_norm): - super().__init__(in_channels=in_channels, out_channels=out_channels, temb_channels=0, conv_op=conv_op, norm_op=norm_op) - - def forward(self, x, conv_carry_in=None, conv_carry_out=None): - h = x - h = [ self.swish(self.norm1(x)) ] - h = conv_carry_causal_3d(h, self.conv1, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out) - - h = [ self.dropout(self.swish(self.norm2(h))) ] - h = conv_carry_causal_3d(h, self.conv2, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out) - - if self.in_channels != self.out_channels: - x = self.nin_shortcut(x) - - return x+h - class Encoder(nn.Module): def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks, ffactor_spatial, ffactor_temporal, downsample_match_channel=True, refiner_vae=True, **_): @@ -191,7 +144,7 @@ class Encoder(nn.Module): self.refiner_vae = refiner_vae if self.refiner_vae: - conv_op = NoPadConv3d + conv_op = CarriedConv3d norm_op = RMS_norm else: conv_op = ops.Conv3d @@ -206,9 +159,10 @@ class Encoder(nn.Module): for i, tgt in enumerate(block_out_channels): stage = nn.Module() - stage.block = nn.ModuleList([HunyuanRefinerResnetBlock(in_channels=ch if j == 0 else tgt, - out_channels=tgt, - conv_op=conv_op, norm_op=norm_op) + stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, + out_channels=tgt, + temb_channels=0, + conv_op=conv_op, norm_op=norm_op) for j in range(num_res_blocks)]) ch = tgt if i < depth: @@ -218,9 +172,9 @@ class Encoder(nn.Module): self.down.append(stage) self.mid = nn.Module() - self.mid.block_1 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op) + self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op) self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op) - self.mid.block_2 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op) + self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op) self.norm_out = norm_op(ch) self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1) @@ -246,22 +200,20 @@ class Encoder(nn.Module): conv_carry_out = [] if i == len(x) - 1: conv_carry_out = None + x1 = [ x1 ] x1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out) for stage in self.down: for blk in stage.block: - x1 = blk(x1, conv_carry_in, conv_carry_out) + x1 = blk(x1, None, conv_carry_in, conv_carry_out) if hasattr(stage, 'downsample'): x1 = stage.downsample(x1, conv_carry_in, conv_carry_out) out.append(x1) conv_carry_in = conv_carry_out - if len(out) > 1: - out = torch.cat(out, dim=2) - else: - out = out[0] + out = torch_cat_if_needed(out, dim=2) x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(out))) del out @@ -288,7 +240,7 @@ class Decoder(nn.Module): self.refiner_vae = refiner_vae if self.refiner_vae: - conv_op = NoPadConv3d + conv_op = CarriedConv3d norm_op = RMS_norm else: conv_op = ops.Conv3d @@ -298,9 +250,9 @@ class Decoder(nn.Module): self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1) self.mid = nn.Module() - self.mid.block_1 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op) + self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op) self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op) - self.mid.block_2 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op) + self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op) self.up = nn.ModuleList() depth = (ffactor_spatial >> 1).bit_length() @@ -308,9 +260,10 @@ class Decoder(nn.Module): for i, tgt in enumerate(block_out_channels): stage = nn.Module() - stage.block = nn.ModuleList([HunyuanRefinerResnetBlock(in_channels=ch if j == 0 else tgt, - out_channels=tgt, - conv_op=conv_op, norm_op=norm_op) + stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, + out_channels=tgt, + temb_channels=0, + conv_op=conv_op, norm_op=norm_op) for j in range(num_res_blocks + 1)]) ch = tgt if i < depth: @@ -340,7 +293,7 @@ class Decoder(nn.Module): conv_carry_out = None for stage in self.up: for blk in stage.block: - x1 = blk(x1, conv_carry_in, conv_carry_out) + x1 = blk(x1, None, conv_carry_in, conv_carry_out) if hasattr(stage, 'upsample'): x1 = stage.upsample(x1, conv_carry_in, conv_carry_out) @@ -350,10 +303,7 @@ class Decoder(nn.Module): conv_carry_in = conv_carry_out del x - if len(out) > 1: - out = torch.cat(out, dim=2) - else: - out = out[0] + out = torch_cat_if_needed(out, dim=2) if not self.refiner_vae: if z.shape[-3] == 1: diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index de1e01cc8..681a55db5 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -13,6 +13,12 @@ if model_management.xformers_enabled_vae(): import xformers import xformers.ops +def torch_cat_if_needed(xl, dim): + if len(xl) > 1: + return torch.cat(xl, dim) + else: + return xl[0] + def get_timestep_embedding(timesteps, embedding_dim): """ This matches the implementation in Denoising Diffusion Probabilistic Models: @@ -43,6 +49,37 @@ def Normalize(in_channels, num_groups=32): return ops.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) +class CarriedConv3d(nn.Module): + def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding=0, **kwargs): + super().__init__() + self.conv = ops.Conv3d(n_channels, out_channels, kernel_size, stride=stride, dilation=dilation, **kwargs) + + def forward(self, x): + return self.conv(x) + + +def conv_carry_causal_3d(xl, op, conv_carry_in=None, conv_carry_out=None): + + x = xl[0] + xl.clear() + + if isinstance(op, CarriedConv3d): + if conv_carry_in is None: + x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2, 0), mode = 'replicate') + else: + carry_len = conv_carry_in[0].shape[2] + x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2 - carry_len, 0), mode = 'replicate') + x = torch.cat([conv_carry_in.pop(0), x], dim=2) + + if conv_carry_out is not None: + to_push = x[:, :, -2:, :, :].clone() + conv_carry_out.append(to_push) + + out = op(x) + + return out + + class VideoConv3d(nn.Module): def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding_mode='replicate', padding=1, **kwargs): super().__init__() @@ -89,29 +126,24 @@ class Upsample(nn.Module): stride=1, padding=1) - def forward(self, x): + def forward(self, x, conv_carry_in=None, conv_carry_out=None): scale_factor = self.scale_factor if isinstance(scale_factor, (int, float)): scale_factor = (scale_factor,) * (x.ndim - 2) if x.ndim == 5 and scale_factor[0] > 1.0: - t = x.shape[2] - if t > 1: - a, b = x.split((1, t - 1), dim=2) - del x - b = interpolate_up(b, scale_factor) - else: - a = x - - a = interpolate_up(a.squeeze(2), scale_factor=scale_factor[1:]).unsqueeze(2) - if t > 1: - x = torch.cat((a, b), dim=2) - else: - x = a + results = [] + if conv_carry_in is None: + first = x[:, :, :1, :, :] + results.append(interpolate_up(first.squeeze(2), scale_factor=scale_factor[1:]).unsqueeze(2)) + x = x[:, :, 1:, :, :] + if x.shape[2] > 0: + results.append(interpolate_up(x, scale_factor)) + x = torch_cat_if_needed(results, dim=2) else: x = interpolate_up(x, scale_factor) if self.with_conv: - x = self.conv(x) + x = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out) return x @@ -127,17 +159,20 @@ class Downsample(nn.Module): stride=stride, padding=0) - def forward(self, x): + def forward(self, x, conv_carry_in=None, conv_carry_out=None): if self.with_conv: - if x.ndim == 4: + if isinstance(self.conv, CarriedConv3d): + x = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out) + elif x.ndim == 4: pad = (0, 1, 0, 1) mode = "constant" x = torch.nn.functional.pad(x, pad, mode=mode, value=0) + x = self.conv(x) elif x.ndim == 5: pad = (1, 1, 1, 1, 2, 0) mode = "replicate" x = torch.nn.functional.pad(x, pad, mode=mode) - x = self.conv(x) + x = self.conv(x) else: x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) return x @@ -183,23 +218,23 @@ class ResnetBlock(nn.Module): stride=1, padding=0) - def forward(self, x, temb=None): + def forward(self, x, temb=None, conv_carry_in=None, conv_carry_out=None): h = x h = self.norm1(h) - h = self.swish(h) - h = self.conv1(h) + h = [ self.swish(h) ] + h = conv_carry_causal_3d(h, self.conv1, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out) if temb is not None: h = h + self.temb_proj(self.swish(temb))[:,:,None,None] h = self.norm2(h) h = self.swish(h) - h = self.dropout(h) - h = self.conv2(h) + h = [ self.dropout(h) ] + h = conv_carry_causal_3d(h, self.conv2, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out) if self.in_channels != self.out_channels: if self.use_conv_shortcut: - x = self.conv_shortcut(x) + x = conv_carry_causal_3d([x], self.conv_shortcut, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out) else: x = self.nin_shortcut(x) @@ -520,9 +555,14 @@ class Encoder(nn.Module): self.num_res_blocks = num_res_blocks self.resolution = resolution self.in_channels = in_channels + self.carried = False if conv3d: - conv_op = VideoConv3d + if not attn_resolutions: + conv_op = CarriedConv3d + self.carried = True + else: + conv_op = VideoConv3d mid_attn_conv_op = ops.Conv3d else: conv_op = ops.Conv2d @@ -535,6 +575,7 @@ class Encoder(nn.Module): stride=1, padding=1) + self.time_compress = 1 curr_res = resolution in_ch_mult = (1,)+tuple(ch_mult) self.in_ch_mult = in_ch_mult @@ -561,10 +602,15 @@ class Encoder(nn.Module): if time_compress is not None: if (self.num_resolutions - 1 - i_level) > math.log2(time_compress): stride = (1, 2, 2) + else: + self.time_compress *= 2 down.downsample = Downsample(block_in, resamp_with_conv, stride=stride, conv_op=conv_op) curr_res = curr_res // 2 self.down.append(down) + if time_compress is not None: + self.time_compress = time_compress + # middle self.mid = nn.Module() self.mid.block_1 = ResnetBlock(in_channels=block_in, @@ -590,15 +636,42 @@ class Encoder(nn.Module): def forward(self, x): # timestep embedding temb = None - # downsampling - h = self.conv_in(x) - for i_level in range(self.num_resolutions): - for i_block in range(self.num_res_blocks): - h = self.down[i_level].block[i_block](h, temb) - if len(self.down[i_level].attn) > 0: - h = self.down[i_level].attn[i_block](h) - if i_level != self.num_resolutions-1: - h = self.down[i_level].downsample(h) + + if self.carried: + xl = [x[:, :, :1, :, :]] + if x.shape[2] > self.time_compress: + tc = self.time_compress + xl += torch.split(x[:, :, 1: 1 + ((x.shape[2] - 1) // tc) * tc, :, :], tc * 2, dim = 2) + x = xl + else: + x = [x] + out = [] + + conv_carry_in = None + + for i, x1 in enumerate(x): + conv_carry_out = [] + if i == len(x) - 1: + conv_carry_out = None + + # downsampling + x1 = [ x1 ] + h1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out) + + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h1 = self.down[i_level].block[i_block](h1, temb, conv_carry_in, conv_carry_out) + if len(self.down[i_level].attn) > 0: + assert i == 0 #carried should not happen if attn exists + h1 = self.down[i_level].attn[i_block](h1) + if i_level != self.num_resolutions-1: + h1 = self.down[i_level].downsample(h1, conv_carry_in, conv_carry_out) + + out.append(h1) + conv_carry_in = conv_carry_out + + h = torch_cat_if_needed(out, dim=2) + del out # middle h = self.mid.block_1(h, temb) @@ -607,15 +680,15 @@ class Encoder(nn.Module): # end h = self.norm_out(h) - h = nonlinearity(h) - h = self.conv_out(h) + h = [ nonlinearity(h) ] + h = conv_carry_causal_3d(h, self.conv_out) return h class Decoder(nn.Module): def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks, attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels, - resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False, + resolution, z_channels, tanh_out=False, use_linear_attn=False, conv_out_op=ops.Conv2d, resnet_op=ResnetBlock, attn_op=AttnBlock, @@ -629,12 +702,18 @@ class Decoder(nn.Module): self.num_res_blocks = num_res_blocks self.resolution = resolution self.in_channels = in_channels - self.give_pre_end = give_pre_end self.tanh_out = tanh_out + self.carried = False if conv3d: - conv_op = VideoConv3d - conv_out_op = VideoConv3d + if not attn_resolutions and resnet_op == ResnetBlock: + conv_op = CarriedConv3d + conv_out_op = CarriedConv3d + self.carried = True + else: + conv_op = VideoConv3d + conv_out_op = VideoConv3d + mid_attn_conv_op = ops.Conv3d else: conv_op = ops.Conv2d @@ -709,29 +788,43 @@ class Decoder(nn.Module): temb = None # z to block_in - h = self.conv_in(z) + h = conv_carry_causal_3d([z], self.conv_in) # middle h = self.mid.block_1(h, temb, **kwargs) h = self.mid.attn_1(h, **kwargs) h = self.mid.block_2(h, temb, **kwargs) + if self.carried: + h = torch.split(h, 2, dim=2) + else: + h = [ h ] + out = [] + + conv_carry_in = None + # upsampling - for i_level in reversed(range(self.num_resolutions)): - for i_block in range(self.num_res_blocks+1): - h = self.up[i_level].block[i_block](h, temb, **kwargs) - if len(self.up[i_level].attn) > 0: - h = self.up[i_level].attn[i_block](h, **kwargs) - if i_level != 0: - h = self.up[i_level].upsample(h) + for i, h1 in enumerate(h): + conv_carry_out = [] + if i == len(h) - 1: + conv_carry_out = None + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks+1): + h1 = self.up[i_level].block[i_block](h1, temb, conv_carry_in, conv_carry_out, **kwargs) + if len(self.up[i_level].attn) > 0: + assert i == 0 #carried should not happen if attn exists + h1 = self.up[i_level].attn[i_block](h1, **kwargs) + if i_level != 0: + h1 = self.up[i_level].upsample(h1, conv_carry_in, conv_carry_out) - # end - if self.give_pre_end: - return h + h1 = self.norm_out(h1) + h1 = [ nonlinearity(h1) ] + h1 = conv_carry_causal_3d(h1, self.conv_out, conv_carry_in, conv_carry_out) + if self.tanh_out: + h1 = torch.tanh(h1) + out.append(h1) + conv_carry_in = conv_carry_out - h = self.norm_out(h) - h = nonlinearity(h) - h = self.conv_out(h, **kwargs) - if self.tanh_out: - h = torch.tanh(h) - return h + out = torch_cat_if_needed(out, dim=2) + + return out From c120eee5bacca643062657d2a7efad83c7d4d828 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Tue, 2 Dec 2025 21:17:13 -0800 Subject: [PATCH 673/759] Add MatchType, DynamicCombo, and Autogrow support to V3 Schema (#10832) * Added output_matchtypes to generated json for v3, initial backend support for MatchType, created nodes_logic.py and added SwitchNode * Fixed providing list of allowed_types * Add workaround in validation.py for V3 Combo outputs not working as Combo inputs * Make match type receive_type pass validation * Also add MatchType check to input_type in validation - will likely trigger when connecting to non-lazy stuff * Make sure this PR only has MatchType stuff * Initial work on DynamicCombo * Add get_dynamic function, not yet filled out correctly * Mark Switch node as Beta * Make sure other unfinished dynamic types are not accidentally used * Send DynamicCombo.Option inputs in the same format as normal v1 inputs * add dynamic combo test node * Support validation of inputs and outputs * Add missing input params to DynamicCombo.Input * Add get_all function to inputs for id validation purposes * Fix imports for v3 returning everything when doing io/ui/IO/UI instead of what is in __all__ of _io.py and _ui.py * Modifying behavior of get_dynamic in V3 + serialization so can be used in execution code * Fix v3 schema validation code after changes * Refactor hidden_values for v3 in execution.py to be more general v3_data, add helper functions for dynamic behavior, preparing for restructuring dynamic type into object (not finished yet) * Add nesting of inputs on DynamicCombo during execution * Work with latest frontend commits * Fix cringe arrows * frontend will no longer namespace dynamic inputs widgets so reflect that in code, refactor build_nested_inputs * Prepare Autogrow support for the love of the game * satisfy ruff * Create test nodes for Autogrow to collab with frontend development * Add nested combo to DCTestNode * Remove array support from build_nested_inputs, properly handle missing expected values * Make execution.validate_inputs properly validate required dynamic inputs, renamed dynamic_data to dynamic_paths for clarity * MatchType does not need any DynamicInput/Output features on backend; will increase compatibility with dynamic types * Probably need this for ruff check * Change MatchType to have template be the first and only required param; output id's do nothing right now, so no need * Fix merge regression with LatentUpscaleModel type not being put in __all__ for _io.py, fix invalid type hint for validate_inputs * Make Switch node inputs optional, disallow both inputs from being missing, and still work properly with lazy; when one input is missing, use the other no matter what the switch is set to * Satisfy ruff * Move MatchType code above the types that inherit from DynamicInput * Add DynamicSlot type, awaiting frontend support * Make curr_prefix creation happen in Autogrow, move curr_prefix in DynamicCombo to only be created if input exists in live_inputs * I was confused, fixing accidentally redundant curr_prefix addition in Autogrow * Make sure Autogrow inputs are force_input = True when WidgetInput, fix runtime validation by removing original input from expected inputs, fix min/max bounds, change test nodes slightly * Remove unnecessary id usage in Autogrow test node outputs * Commented out Switch node + test nodes * Remove commented out code from Autogrow * Make TemplatePrefix max more clear, allow max == 1 * Replace all dict[str] with dict[str, Any] * Renamed add_to_dict_live_inputs to expand_schema_for_dynamic * Fixed typo in DynamicSlot input code * note about live_inputs not being present soon in get_v1_info (internal function anyway) * For now, hide DynamicCombo and Autogrow from public interface * Removed comment --- comfy_api/latest/__init__.py | 4 +- comfy_api/latest/_io.py | 416 ++++++++++++++++++++++++++------- comfy_api/latest/_io_public.py | 1 + comfy_api/latest/_ui_public.py | 1 + comfy_api/v0_0_2/__init__.py | 6 +- comfy_execution/validation.py | 6 + comfy_extras/nodes_logic.py | 155 ++++++++++++ execution.py | 40 ++-- nodes.py | 1 + 9 files changed, 525 insertions(+), 105 deletions(-) create mode 100644 comfy_api/latest/_io_public.py create mode 100644 comfy_api/latest/_ui_public.py create mode 100644 comfy_extras/nodes_logic.py diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py index 176ae36e0..0fa01d1e7 100644 --- a/comfy_api/latest/__init__.py +++ b/comfy_api/latest/__init__.py @@ -8,8 +8,8 @@ from comfy_api.internal.async_to_sync import create_sync_class from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL -from . import _io as io -from . import _ui as ui +from . import _io_public as io +from . import _ui_public as ui # from comfy_api.latest._resources import _RESOURCES as resources #noqa: F401 from comfy_execution.utils import get_executing_context from comfy_execution.progress import get_progress_state, PreviewImageTuple diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 79c0722a9..257f07c42 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -4,6 +4,7 @@ import copy import inspect from abc import ABC, abstractmethod from collections import Counter +from collections.abc import Iterable from dataclasses import asdict, dataclass from enum import Enum from typing import Any, Callable, Literal, TypedDict, TypeVar, TYPE_CHECKING @@ -150,6 +151,9 @@ class _IO_V3: def __init__(self): pass + def validate(self): + pass + @property def io_type(self): return self.Parent.io_type @@ -182,6 +186,9 @@ class Input(_IO_V3): def get_io_type(self): return _StringIOType(self.io_type) + def get_all(self) -> list[Input]: + return [self] + class WidgetInput(Input): ''' Base class for a V3 Input with widget. @@ -814,13 +821,61 @@ class MultiType: else: return super().as_dict() +@comfytype(io_type="COMFY_MATCHTYPE_V3") +class MatchType(ComfyTypeIO): + class Template: + def __init__(self, template_id: str, allowed_types: _ComfyType | list[_ComfyType] = AnyType): + self.template_id = template_id + # account for syntactic sugar + if not isinstance(allowed_types, Iterable): + allowed_types = [allowed_types] + for t in allowed_types: + if not isinstance(t, type): + if not isinstance(t, _ComfyType): + raise ValueError(f"Allowed types must be a ComfyType or a list of ComfyTypes, got {t.__class__.__name__}") + else: + if not issubclass(t, _ComfyType): + raise ValueError(f"Allowed types must be a ComfyType or a list of ComfyTypes, got {t.__name__}") + self.allowed_types = allowed_types + + def as_dict(self): + return { + "template_id": self.template_id, + "allowed_types": ",".join([t.io_type for t in self.allowed_types]), + } + + class Input(Input): + def __init__(self, id: str, template: MatchType.Template, + display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None): + super().__init__(id, display_name, optional, tooltip, lazy, extra_dict) + self.template = template + + def as_dict(self): + return super().as_dict() | prune_dict({ + "template": self.template.as_dict(), + }) + + class Output(Output): + def __init__(self, template: MatchType.Template, id: str=None, display_name: str=None, tooltip: str=None, + is_output_list=False): + super().__init__(id, display_name, tooltip, is_output_list) + self.template = template + + def as_dict(self): + return super().as_dict() | prune_dict({ + "template": self.template.as_dict(), + }) + class DynamicInput(Input, ABC): ''' Abstract class for dynamic input registration. ''' - @abstractmethod def get_dynamic(self) -> list[Input]: - ... + return [] + + def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''): + pass + class DynamicOutput(Output, ABC): ''' @@ -830,99 +885,223 @@ class DynamicOutput(Output, ABC): is_output_list=False): super().__init__(id, display_name, tooltip, is_output_list) - @abstractmethod def get_dynamic(self) -> list[Output]: - ... + return [] @comfytype(io_type="COMFY_AUTOGROW_V3") -class AutogrowDynamic(ComfyTypeI): - Type = list[Any] - class Input(DynamicInput): - def __init__(self, id: str, template_input: Input, min: int=1, max: int=None, - display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None): - super().__init__(id, display_name, optional, tooltip, lazy, extra_dict) - self.template_input = template_input - if min is not None: - assert(min >= 1) - if max is not None: - assert(max >= 1) +class Autogrow(ComfyTypeI): + Type = dict[str, Any] + _MaxNames = 100 # NOTE: max 100 names for sanity + + class _AutogrowTemplate: + def __init__(self, input: Input): + # dynamic inputs are not allowed as the template input + assert(not isinstance(input, DynamicInput)) + self.input = copy.copy(input) + if isinstance(self.input, WidgetInput): + self.input.force_input = True + self.names: list[str] = [] + self.cached_inputs = {} + + def _create_input(self, input: Input, name: str): + new_input = copy.copy(self.input) + new_input.id = name + return new_input + + def _create_cached_inputs(self): + for name in self.names: + self.cached_inputs[name] = self._create_input(self.input, name) + + def get_all(self) -> list[Input]: + return list(self.cached_inputs.values()) + + def as_dict(self): + return prune_dict({ + "input": create_input_dict_v1([self.input]), + }) + + def validate(self): + self.input.validate() + + def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''): + real_inputs = [] + for name, input in self.cached_inputs.items(): + if name in live_inputs: + real_inputs.append(input) + add_to_input_dict_v1(d, real_inputs, live_inputs, curr_prefix) + add_dynamic_id_mapping(d, real_inputs, curr_prefix) + + class TemplatePrefix(_AutogrowTemplate): + def __init__(self, input: Input, prefix: str, min: int=1, max: int=10): + super().__init__(input) + self.prefix = prefix + assert(min >= 0) + assert(max >= 1) + assert(max <= Autogrow._MaxNames) self.min = min self.max = max + self.names = [f"{self.prefix}{i}" for i in range(self.max)] + self._create_cached_inputs() + + def as_dict(self): + return super().as_dict() | prune_dict({ + "prefix": self.prefix, + "min": self.min, + "max": self.max, + }) + + class TemplateNames(_AutogrowTemplate): + def __init__(self, input: Input, names: list[str], min: int=1): + super().__init__(input) + self.names = names[:Autogrow._MaxNames] + assert(min >= 0) + self.min = min + self._create_cached_inputs() + + def as_dict(self): + return super().as_dict() | prune_dict({ + "names": self.names, + "min": self.min, + }) + + class Input(DynamicInput): + def __init__(self, id: str, template: Autogrow.TemplatePrefix | Autogrow.TemplateNames, + display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None): + super().__init__(id, display_name, optional, tooltip, lazy, extra_dict) + self.template = template + + def as_dict(self): + return super().as_dict() | prune_dict({ + "template": self.template.as_dict(), + }) def get_dynamic(self) -> list[Input]: - curr_count = 1 - new_inputs = [] - for i in range(self.min): - new_input = copy.copy(self.template_input) - new_input.id = f"{new_input.id}{curr_count}_${self.id}_ag$" - if new_input.display_name is not None: - new_input.display_name = f"{new_input.display_name}{curr_count}" - new_input.optional = self.optional or new_input.optional - if isinstance(self.template_input, WidgetInput): - new_input.force_input = True - new_inputs.append(new_input) - curr_count += 1 - # pretend to expand up to max - for i in range(curr_count-1, self.max): - new_input = copy.copy(self.template_input) - new_input.id = f"{new_input.id}{curr_count}_${self.id}_ag$" - if new_input.display_name is not None: - new_input.display_name = f"{new_input.display_name}{curr_count}" - new_input.optional = True - if isinstance(self.template_input, WidgetInput): - new_input.force_input = True - new_inputs.append(new_input) - curr_count += 1 - return new_inputs + return self.template.get_all() -@comfytype(io_type="COMFY_COMBODYNAMIC_V3") -class ComboDynamic(ComfyTypeI): - class Input(DynamicInput): - def __init__(self, id: str): - pass + def get_all(self) -> list[Input]: + return [self] + self.template.get_all() -@comfytype(io_type="COMFY_MATCHTYPE_V3") -class MatchType(ComfyTypeIO): - class Template: - def __init__(self, template_id: str, allowed_types: _ComfyType | list[_ComfyType]): - self.template_id = template_id - self.allowed_types = [allowed_types] if isinstance(allowed_types, _ComfyType) else allowed_types + def validate(self): + self.template.validate() + + def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''): + curr_prefix = f"{curr_prefix}{self.id}." + # need to remove self from expected inputs dictionary; replaced by template inputs in frontend + for inner_dict in d.values(): + if self.id in inner_dict: + del inner_dict[self.id] + self.template.expand_schema_for_dynamic(d, live_inputs, curr_prefix) + +@comfytype(io_type="COMFY_DYNAMICCOMBO_V3") +class DynamicCombo(ComfyTypeI): + Type = dict[str, Any] + + class Option: + def __init__(self, key: str, inputs: list[Input]): + self.key = key + self.inputs = inputs def as_dict(self): return { - "template_id": self.template_id, - "allowed_types": "".join(t.io_type for t in self.allowed_types), + "key": self.key, + "inputs": create_input_dict_v1(self.inputs), } class Input(DynamicInput): - def __init__(self, id: str, template: MatchType.Template, + def __init__(self, id: str, options: list[DynamicCombo.Option], display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None): super().__init__(id, display_name, optional, tooltip, lazy, extra_dict) - self.template = template + self.options = options + + def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''): + # check if dynamic input's id is in live_inputs + if self.id in live_inputs: + curr_prefix = f"{curr_prefix}{self.id}." + key = live_inputs[self.id] + selected_option = None + for option in self.options: + if option.key == key: + selected_option = option + break + if selected_option is not None: + add_to_input_dict_v1(d, selected_option.inputs, live_inputs, curr_prefix) + add_dynamic_id_mapping(d, selected_option.inputs, curr_prefix, self) def get_dynamic(self) -> list[Input]: - return [self] + return [input for option in self.options for input in option.inputs] + + def get_all(self) -> list[Input]: + return [self] + [input for option in self.options for input in option.inputs] def as_dict(self): return super().as_dict() | prune_dict({ - "template": self.template.as_dict(), + "options": [o.as_dict() for o in self.options], }) - class Output(DynamicOutput): - def __init__(self, id: str, template: MatchType.Template, display_name: str=None, tooltip: str=None, - is_output_list=False): - super().__init__(id, display_name, tooltip, is_output_list) - self.template = template + def validate(self): + # make sure all nested inputs are validated + for option in self.options: + for input in option.inputs: + input.validate() - def get_dynamic(self) -> list[Output]: - return [self] +@comfytype(io_type="COMFY_DYNAMICSLOT_V3") +class DynamicSlot(ComfyTypeI): + Type = dict[str, Any] + + class Input(DynamicInput): + def __init__(self, slot: Input, inputs: list[Input], + display_name: str=None, tooltip: str=None, lazy: bool=None, extra_dict=None): + assert(not isinstance(slot, DynamicInput)) + self.slot = copy.copy(slot) + self.slot.display_name = slot.display_name if slot.display_name is not None else display_name + optional = True + self.slot.tooltip = slot.tooltip if slot.tooltip is not None else tooltip + self.slot.lazy = slot.lazy if slot.lazy is not None else lazy + self.slot.extra_dict = slot.extra_dict if slot.extra_dict is not None else extra_dict + super().__init__(slot.id, self.slot.display_name, optional, self.slot.tooltip, self.slot.lazy, self.slot.extra_dict) + self.inputs = inputs + self.force_input = None + # force widget inputs to have no widgets, otherwise this would be awkward + if isinstance(self.slot, WidgetInput): + self.force_input = True + self.slot.force_input = True + + def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''): + if self.id in live_inputs: + curr_prefix = f"{curr_prefix}{self.id}." + add_to_input_dict_v1(d, self.inputs, live_inputs, curr_prefix) + add_dynamic_id_mapping(d, [self.slot] + self.inputs, curr_prefix) + + def get_dynamic(self) -> list[Input]: + return [self.slot] + self.inputs + + def get_all(self) -> list[Input]: + return [self] + [self.slot] + self.inputs def as_dict(self): return super().as_dict() | prune_dict({ - "template": self.template.as_dict(), + "slotType": str(self.slot.get_io_type()), + "inputs": create_input_dict_v1(self.inputs), + "forceInput": self.force_input, }) + def validate(self): + self.slot.validate() + for input in self.inputs: + input.validate() + +def add_dynamic_id_mapping(d: dict[str, Any], inputs: list[Input], curr_prefix: str, self: DynamicInput=None): + dynamic = d.setdefault("dynamic_paths", {}) + if self is not None: + dynamic[self.id] = f"{curr_prefix}{self.id}" + for i in inputs: + if not isinstance(i, DynamicInput): + dynamic[f"{i.id}"] = f"{curr_prefix}{i.id}" + +class V3Data(TypedDict): + hidden_inputs: dict[str, Any] + dynamic_paths: dict[str, Any] class HiddenHolder: def __init__(self, unique_id: str, prompt: Any, @@ -984,6 +1163,7 @@ class NodeInfoV1: output_is_list: list[bool]=None output_name: list[str]=None output_tooltips: list[str]=None + output_matchtypes: list[str]=None name: str=None display_name: str=None description: str=None @@ -1061,7 +1241,11 @@ class Schema: '''Validate the schema: - verify ids on inputs and outputs are unique - both internally and in relation to each other ''' - input_ids = [i.id for i in self.inputs] if self.inputs is not None else [] + nested_inputs: list[Input] = [] + if self.inputs is not None: + for input in self.inputs: + nested_inputs.extend(input.get_all()) + input_ids = [i.id for i in nested_inputs] if nested_inputs is not None else [] output_ids = [o.id for o in self.outputs] if self.outputs is not None else [] input_set = set(input_ids) output_set = set(output_ids) @@ -1077,6 +1261,13 @@ class Schema: issues.append(f"Ids must be unique between inputs and outputs, but {intersection} are not.") if len(issues) > 0: raise ValueError("\n".join(issues)) + # validate inputs and outputs + if self.inputs is not None: + for input in self.inputs: + input.validate() + if self.outputs is not None: + for output in self.outputs: + output.validate() def finalize(self): """Add hidden based on selected schema options, and give outputs without ids default ids.""" @@ -1102,19 +1293,10 @@ class Schema: if output.id is None: output.id = f"_{i}_{output.io_type}_" - def get_v1_info(self, cls) -> NodeInfoV1: + def get_v1_info(self, cls, live_inputs: dict[str, Any]=None) -> NodeInfoV1: + # NOTE: live_inputs will not be used anymore very soon and this will be done another way # get V1 inputs - input = { - "required": {} - } - if self.inputs: - for i in self.inputs: - if isinstance(i, DynamicInput): - dynamic_inputs = i.get_dynamic() - for d in dynamic_inputs: - add_to_dict_v1(d, input) - else: - add_to_dict_v1(i, input) + input = create_input_dict_v1(self.inputs, live_inputs) if self.hidden: for hidden in self.hidden: input.setdefault("hidden", {})[hidden.name] = (hidden.value,) @@ -1123,12 +1305,24 @@ class Schema: output_is_list = [] output_name = [] output_tooltips = [] + output_matchtypes = [] + any_matchtypes = False if self.outputs: for o in self.outputs: output.append(o.io_type) output_is_list.append(o.is_output_list) output_name.append(o.display_name if o.display_name else o.io_type) output_tooltips.append(o.tooltip if o.tooltip else None) + # special handling for MatchType + if isinstance(o, MatchType.Output): + output_matchtypes.append(o.template.template_id) + any_matchtypes = True + else: + output_matchtypes.append(None) + + # clear out lists that are all None + if not any_matchtypes: + output_matchtypes = None info = NodeInfoV1( input=input, @@ -1137,6 +1331,7 @@ class Schema: output_is_list=output_is_list, output_name=output_name, output_tooltips=output_tooltips, + output_matchtypes=output_matchtypes, name=self.node_id, display_name=self.display_name, category=self.category, @@ -1182,16 +1377,57 @@ class Schema: return info -def add_to_dict_v1(i: Input, input: dict): +def create_input_dict_v1(inputs: list[Input], live_inputs: dict[str, Any]=None) -> dict: + input = { + "required": {} + } + add_to_input_dict_v1(input, inputs, live_inputs) + return input + +def add_to_input_dict_v1(d: dict[str, Any], inputs: list[Input], live_inputs: dict[str, Any]=None, curr_prefix=''): + for i in inputs: + if isinstance(i, DynamicInput): + add_to_dict_v1(i, d) + if live_inputs is not None: + i.expand_schema_for_dynamic(d, live_inputs, curr_prefix) + else: + add_to_dict_v1(i, d) + +def add_to_dict_v1(i: Input, d: dict, dynamic_dict: dict=None): key = "optional" if i.optional else "required" as_dict = i.as_dict() # for v1, we don't want to include the optional key as_dict.pop("optional", None) - input.setdefault(key, {})[i.id] = (i.get_io_type(), as_dict) + if dynamic_dict is None: + value = (i.get_io_type(), as_dict) + else: + value = (i.get_io_type(), as_dict, dynamic_dict) + d.setdefault(key, {})[i.id] = value def add_to_dict_v3(io: Input | Output, d: dict): d[io.id] = (io.get_io_type(), io.as_dict()) +def build_nested_inputs(values: dict[str, Any], v3_data: V3Data): + paths = v3_data.get("dynamic_paths", None) + if paths is None: + return values + values = values.copy() + result = {} + + for key, path in paths.items(): + parts = path.split(".") + current = result + + for i, p in enumerate(parts): + is_last = (i == len(parts) - 1) + + if is_last: + current[p] = values.pop(key, None) + else: + current = current.setdefault(p, {}) + + values.update(result) + return values class _ComfyNodeBaseInternal(_ComfyNodeInternal): @@ -1311,12 +1547,12 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal): @final @classmethod - def PREPARE_CLASS_CLONE(cls, hidden_inputs: dict) -> type[ComfyNode]: + def PREPARE_CLASS_CLONE(cls, v3_data: V3Data) -> type[ComfyNode]: """Creates clone of real node class to prevent monkey-patching.""" c_type: type[ComfyNode] = cls if is_class(cls) else type(cls) type_clone: type[ComfyNode] = shallow_clone_class(c_type) # set hidden - type_clone.hidden = HiddenHolder.from_dict(hidden_inputs) + type_clone.hidden = HiddenHolder.from_dict(v3_data["hidden_inputs"]) return type_clone @final @@ -1433,14 +1669,18 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal): @final @classmethod - def INPUT_TYPES(cls, include_hidden=True, return_schema=False) -> dict[str, dict] | tuple[dict[str, dict], Schema]: + def INPUT_TYPES(cls, include_hidden=True, return_schema=False, live_inputs=None) -> dict[str, dict] | tuple[dict[str, dict], Schema, V3Data]: schema = cls.FINALIZE_SCHEMA() - info = schema.get_v1_info(cls) + info = schema.get_v1_info(cls, live_inputs) input = info.input if not include_hidden: input.pop("hidden", None) if return_schema: - return input, schema + v3_data: V3Data = {} + dynamic = input.pop("dynamic_paths", None) + if dynamic is not None: + v3_data["dynamic_paths"] = dynamic + return input, schema, v3_data return input @final @@ -1513,7 +1753,7 @@ class ComfyNode(_ComfyNodeBaseInternal): raise NotImplementedError @classmethod - def validate_inputs(cls, **kwargs) -> bool: + def validate_inputs(cls, **kwargs) -> bool | str: """Optionally, define this function to validate inputs; equivalent to V1's VALIDATE_INPUTS.""" raise NotImplementedError @@ -1628,6 +1868,7 @@ __all__ = [ "StyleModel", "Gligen", "UpscaleModel", + "LatentUpscaleModel", "Audio", "Video", "SVG", @@ -1651,6 +1892,10 @@ __all__ = [ "SEGS", "AnyType", "MultiType", + # Dynamic Types + "MatchType", + # "DynamicCombo", + # "Autogrow", # Other classes "HiddenHolder", "Hidden", @@ -1661,4 +1906,5 @@ __all__ = [ "NodeOutput", "add_to_dict_v1", "add_to_dict_v3", + "V3Data", ] diff --git a/comfy_api/latest/_io_public.py b/comfy_api/latest/_io_public.py new file mode 100644 index 000000000..43c7680f3 --- /dev/null +++ b/comfy_api/latest/_io_public.py @@ -0,0 +1 @@ +from ._io import * # noqa: F403 diff --git a/comfy_api/latest/_ui_public.py b/comfy_api/latest/_ui_public.py new file mode 100644 index 000000000..85b11d78b --- /dev/null +++ b/comfy_api/latest/_ui_public.py @@ -0,0 +1 @@ +from ._ui import * # noqa: F403 diff --git a/comfy_api/v0_0_2/__init__.py b/comfy_api/v0_0_2/__init__.py index de0f95001..c4fa1d971 100644 --- a/comfy_api/v0_0_2/__init__.py +++ b/comfy_api/v0_0_2/__init__.py @@ -6,7 +6,7 @@ from comfy_api.latest import ( ) from typing import Type, TYPE_CHECKING from comfy_api.internal.async_to_sync import create_sync_class -from comfy_api.latest import io, ui, ComfyExtension #noqa: F401 +from comfy_api.latest import io, ui, IO, UI, ComfyExtension #noqa: F401 class ComfyAPIAdapter_v0_0_2(ComfyAPI_latest): @@ -42,4 +42,8 @@ __all__ = [ "InputImpl", "Types", "ComfyExtension", + "io", + "IO", + "ui", + "UI", ] diff --git a/comfy_execution/validation.py b/comfy_execution/validation.py index cec105fc9..24c0b4ed7 100644 --- a/comfy_execution/validation.py +++ b/comfy_execution/validation.py @@ -1,4 +1,5 @@ from __future__ import annotations +from comfy_api.latest import IO def validate_node_input( @@ -23,6 +24,11 @@ def validate_node_input( if not received_type != input_type: return True + # If the received type or input_type is a MatchType, we can return True immediately; + # validation for this is handled by the frontend + if received_type == IO.MatchType.io_type or input_type == IO.MatchType.io_type: + return True + # Not equal, and not strings if not isinstance(received_type, str) or not isinstance(input_type, str): return False diff --git a/comfy_extras/nodes_logic.py b/comfy_extras/nodes_logic.py new file mode 100644 index 000000000..95a6ba788 --- /dev/null +++ b/comfy_extras/nodes_logic.py @@ -0,0 +1,155 @@ +from typing import TypedDict +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io +from comfy_api.latest import _io + + + +class SwitchNode(io.ComfyNode): + @classmethod + def define_schema(cls): + template = io.MatchType.Template("switch") + return io.Schema( + node_id="ComfySwitchNode", + display_name="Switch", + category="logic", + is_experimental=True, + inputs=[ + io.Boolean.Input("switch"), + io.MatchType.Input("on_false", template=template, lazy=True, optional=True), + io.MatchType.Input("on_true", template=template, lazy=True, optional=True), + ], + outputs=[ + io.MatchType.Output(template=template, display_name="output"), + ], + ) + + @classmethod + def check_lazy_status(cls, switch, on_false=..., on_true=...): + # We use ... instead of None, as None is passed for connected-but-unevaluated inputs. + # This trick allows us to ignore the value of the switch and still be able to run execute(). + + # One of the inputs may be missing, in which case we need to evaluate the other input + if on_false is ...: + return ["on_true"] + if on_true is ...: + return ["on_false"] + # Normal lazy switch operation + if switch and on_true is None: + return ["on_true"] + if not switch and on_false is None: + return ["on_false"] + + @classmethod + def validate_inputs(cls, switch, on_false=..., on_true=...): + # This check happens before check_lazy_status(), so we can eliminate the case where + # both inputs are missing. + if on_false is ... and on_true is ...: + return "At least one of on_false or on_true must be connected to Switch node" + return True + + @classmethod + def execute(cls, switch, on_true=..., on_false=...) -> io.NodeOutput: + if on_true is ...: + return io.NodeOutput(on_false) + if on_false is ...: + return io.NodeOutput(on_true) + return io.NodeOutput(on_true if switch else on_false) + + +class DCTestNode(io.ComfyNode): + class DCValues(TypedDict): + combo: str + string: str + integer: int + image: io.Image.Type + subcombo: dict[str] + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="DCTestNode", + display_name="DCTest", + category="logic", + is_output_node=True, + inputs=[_io.DynamicCombo.Input("combo", options=[ + _io.DynamicCombo.Option("option1", [io.String.Input("string")]), + _io.DynamicCombo.Option("option2", [io.Int.Input("integer")]), + _io.DynamicCombo.Option("option3", [io.Image.Input("image")]), + _io.DynamicCombo.Option("option4", [ + _io.DynamicCombo.Input("subcombo", options=[ + _io.DynamicCombo.Option("opt1", [io.Float.Input("float_x"), io.Float.Input("float_y")]), + _io.DynamicCombo.Option("opt2", [io.Mask.Input("mask1", optional=True)]), + ]) + ])] + )], + outputs=[io.AnyType.Output()], + ) + + @classmethod + def execute(cls, combo: DCValues) -> io.NodeOutput: + combo_val = combo["combo"] + if combo_val == "option1": + return io.NodeOutput(combo["string"]) + elif combo_val == "option2": + return io.NodeOutput(combo["integer"]) + elif combo_val == "option3": + return io.NodeOutput(combo["image"]) + elif combo_val == "option4": + return io.NodeOutput(f"{combo['subcombo']}") + else: + raise ValueError(f"Invalid combo: {combo_val}") + + +class AutogrowNamesTestNode(io.ComfyNode): + @classmethod + def define_schema(cls): + template = _io.Autogrow.TemplateNames(input=io.Float.Input("float"), names=["a", "b", "c"]) + return io.Schema( + node_id="AutogrowNamesTestNode", + display_name="AutogrowNamesTest", + category="logic", + inputs=[ + _io.Autogrow.Input("autogrow", template=template) + ], + outputs=[io.String.Output()], + ) + + @classmethod + def execute(cls, autogrow: _io.Autogrow.Type) -> io.NodeOutput: + vals = list(autogrow.values()) + combined = ",".join([str(x) for x in vals]) + return io.NodeOutput(combined) + +class AutogrowPrefixTestNode(io.ComfyNode): + @classmethod + def define_schema(cls): + template = _io.Autogrow.TemplatePrefix(input=io.Float.Input("float"), prefix="float", min=1, max=10) + return io.Schema( + node_id="AutogrowPrefixTestNode", + display_name="AutogrowPrefixTest", + category="logic", + inputs=[ + _io.Autogrow.Input("autogrow", template=template) + ], + outputs=[io.String.Output()], + ) + + @classmethod + def execute(cls, autogrow: _io.Autogrow.Type) -> io.NodeOutput: + vals = list(autogrow.values()) + combined = ",".join([str(x) for x in vals]) + return io.NodeOutput(combined) + +class LogicExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + # SwitchNode, + # DCTestNode, + # AutogrowNamesTestNode, + # AutogrowPrefixTestNode, + ] + +async def comfy_entrypoint() -> LogicExtension: + return LogicExtension() diff --git a/execution.py b/execution.py index 17c77beab..c2186ac98 100644 --- a/execution.py +++ b/execution.py @@ -34,7 +34,7 @@ from comfy_execution.validation import validate_node_input from comfy_execution.progress import get_progress_state, reset_progress_state, add_progress_handler, WebUIProgressHandler from comfy_execution.utils import CurrentNodeContext from comfy_api.internal import _ComfyNodeInternal, _NodeOutputInternal, first_real_override, is_class, make_locked_method_func -from comfy_api.latest import io +from comfy_api.latest import io, _io class ExecutionResult(Enum): @@ -76,7 +76,7 @@ class IsChangedCache: return self.is_changed[node_id] # Intentionally do not use cached outputs here. We only want constants in IS_CHANGED - input_data_all, _, hidden_inputs = get_input_data(node["inputs"], class_def, node_id, None) + input_data_all, _, v3_data = get_input_data(node["inputs"], class_def, node_id, None) try: is_changed = await _async_map_node_over_list(self.prompt_id, node_id, class_def, input_data_all, is_changed_name) is_changed = await resolve_map_node_over_list_results(is_changed) @@ -146,8 +146,9 @@ SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org") def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=None, extra_data={}): is_v3 = issubclass(class_def, _ComfyNodeInternal) + v3_data: io.V3Data = {} if is_v3: - valid_inputs, schema = class_def.INPUT_TYPES(include_hidden=False, return_schema=True) + valid_inputs, schema, v3_data = class_def.INPUT_TYPES(include_hidden=False, return_schema=True, live_inputs=inputs) else: valid_inputs = class_def.INPUT_TYPES() input_data_all = {} @@ -207,7 +208,8 @@ def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt= input_data_all[x] = [extra_data.get("auth_token_comfy_org", None)] if h[x] == "API_KEY_COMFY_ORG": input_data_all[x] = [extra_data.get("api_key_comfy_org", None)] - return input_data_all, missing_keys, hidden_inputs_v3 + v3_data["hidden_inputs"] = hidden_inputs_v3 + return input_data_all, missing_keys, v3_data map_node_over_list = None #Don't hook this please @@ -223,7 +225,7 @@ async def resolve_map_node_over_list_results(results): raise exc return [x.result() if isinstance(x, asyncio.Task) else x for x in results] -async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None, hidden_inputs=None): +async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None, v3_data=None): # check if node wants the lists input_is_list = getattr(obj, "INPUT_IS_LIST", False) @@ -259,13 +261,16 @@ async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, f if is_class(obj): type_obj = obj obj.VALIDATE_CLASS() - class_clone = obj.PREPARE_CLASS_CLONE(hidden_inputs) + class_clone = obj.PREPARE_CLASS_CLONE(v3_data) # otherwise, use class instance to populate/reuse some fields else: type_obj = type(obj) type_obj.VALIDATE_CLASS() - class_clone = type_obj.PREPARE_CLASS_CLONE(hidden_inputs) + class_clone = type_obj.PREPARE_CLASS_CLONE(v3_data) f = make_locked_method_func(type_obj, func, class_clone) + # in case of dynamic inputs, restructure inputs to expected nested dict + if v3_data is not None: + inputs = _io.build_nested_inputs(inputs, v3_data) # V1 else: f = getattr(obj, func) @@ -320,8 +325,8 @@ def merge_result_data(results, obj): output.append([o[i] for o in results]) return output -async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, pre_execute_cb=None, hidden_inputs=None): - return_values = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, hidden_inputs=hidden_inputs) +async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, pre_execute_cb=None, v3_data=None): + return_values = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, v3_data=v3_data) has_pending_task = any(isinstance(r, asyncio.Task) and not r.done() for r in return_values) if has_pending_task: return return_values, {}, False, has_pending_task @@ -460,7 +465,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, has_subgraph = False else: get_progress_state().start_progress(unique_id) - input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data) + input_data_all, missing_keys, v3_data = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data) if server.client_id is not None: server.last_node_id = display_node_id server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id) @@ -475,7 +480,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, else: lazy_status_present = getattr(obj, "check_lazy_status", None) is not None if lazy_status_present: - required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True, hidden_inputs=hidden_inputs) + required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True, v3_data=v3_data) required_inputs = await resolve_map_node_over_list_results(required_inputs) required_inputs = set(sum([r for r in required_inputs if isinstance(r,list)], [])) required_inputs = [x for x in required_inputs if isinstance(x,str) and ( @@ -507,7 +512,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, def pre_execute_cb(call_index): # TODO - How to handle this with async functions without contextvars (which requires Python 3.12)? GraphBuilder.set_default_prefix(unique_id, call_index, 0) - output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, hidden_inputs=hidden_inputs) + output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, v3_data=v3_data) if has_pending_tasks: pending_async_nodes[unique_id] = output_data unblock = execution_list.add_external_block(unique_id) @@ -745,18 +750,17 @@ async def validate_inputs(prompt_id, prompt, item, validated): class_type = prompt[unique_id]['class_type'] obj_class = nodes.NODE_CLASS_MAPPINGS[class_type] - class_inputs = obj_class.INPUT_TYPES() - valid_inputs = set(class_inputs.get('required',{})).union(set(class_inputs.get('optional',{}))) - errors = [] valid = True validate_function_inputs = [] validate_has_kwargs = False if issubclass(obj_class, _ComfyNodeInternal): + class_inputs, _, _ = obj_class.INPUT_TYPES(include_hidden=False, return_schema=True, live_inputs=inputs) validate_function_name = "validate_inputs" validate_function = first_real_override(obj_class, validate_function_name) else: + class_inputs = obj_class.INPUT_TYPES() validate_function_name = "VALIDATE_INPUTS" validate_function = getattr(obj_class, validate_function_name, None) if validate_function is not None: @@ -765,6 +769,8 @@ async def validate_inputs(prompt_id, prompt, item, validated): validate_has_kwargs = argspec.varkw is not None received_types = {} + valid_inputs = set(class_inputs.get('required',{})).union(set(class_inputs.get('optional',{}))) + for x in valid_inputs: input_type, input_category, extra_info = get_input_info(obj_class, x, class_inputs) assert extra_info is not None @@ -935,7 +941,7 @@ async def validate_inputs(prompt_id, prompt, item, validated): continue if len(validate_function_inputs) > 0 or validate_has_kwargs: - input_data_all, _, hidden_inputs = get_input_data(inputs, obj_class, unique_id) + input_data_all, _, v3_data = get_input_data(inputs, obj_class, unique_id) input_filtered = {} for x in input_data_all: if x in validate_function_inputs or validate_has_kwargs: @@ -943,7 +949,7 @@ async def validate_inputs(prompt_id, prompt, item, validated): if 'input_types' in validate_function_inputs: input_filtered['input_types'] = [received_types] - ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, validate_function_name, hidden_inputs=hidden_inputs) + ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, validate_function_name, v3_data=v3_data) ret = await resolve_map_node_over_list_results(ret) for x in input_filtered: for i, r in enumerate(ret): diff --git a/nodes.py b/nodes.py index 4c910a34b..356aa63df 100644 --- a/nodes.py +++ b/nodes.py @@ -2355,6 +2355,7 @@ async def init_builtin_extra_nodes(): "nodes_easycache.py", "nodes_audio_encoder.py", "nodes_rope.py", + "nodes_logic.py", "nodes_nop.py", ] From 861817d22d2659099811b56005c9eaea18d64c73 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 2 Dec 2025 21:47:51 -0800 Subject: [PATCH 674/759] Fix issue with portable updater. (#11070) This should fix the problem with the portable updater not working with portables created from a separate branch on the repo. This does not affect any current portables who were all created on the master branch. --- .ci/update_windows/update.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.ci/update_windows/update.py b/.ci/update_windows/update.py index 51a263203..59ece5130 100755 --- a/.ci/update_windows/update.py +++ b/.ci/update_windows/update.py @@ -66,8 +66,10 @@ if branch is None: try: ref = repo.lookup_reference('refs/remotes/origin/master') except: - print("pulling.") # noqa: T201 - pull(repo) + print("fetching.") # noqa: T201 + for remote in repo.remotes: + if remote.name == "origin": + remote.fetch() ref = repo.lookup_reference('refs/remotes/origin/master') repo.checkout(ref) branch = repo.lookup_branch('master') @@ -149,3 +151,4 @@ try: shutil.copy(stable_update_script, stable_update_script_to) except: pass + From 519c9411653df99761053c30e101816e0ca3c24b Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Wed, 3 Dec 2025 17:28:45 +1000 Subject: [PATCH 675/759] Prs/lora reservations (reduce massive Lora reservations especially on Flux2) (#11069) * mp: only count the offload cost of math once This was previously bundling the combined weight storage and computation cost * ops: put all post async transfer compute on the main stream Some models have massive weights that need either complex dequantization or lora patching. Don't do these patchings on the offload stream, instead do them on the main stream to syncrhonize the potentially large vram spikes for these compute processes. This avoids having to assume a worst case scenario of multiple offload streams all spiking VRAM is parallel with whatever the main stream is doing. --- comfy/model_patcher.py | 4 ++-- comfy/ops.py | 39 ++++++++++++++++++++++----------------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 3eac77275..df2d8e827 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -704,7 +704,7 @@ class ModelPatcher: lowvram_weight = False - potential_offload = max(offload_buffer, module_offload_mem * (comfy.model_management.NUM_STREAMS + 1)) + potential_offload = max(offload_buffer, module_offload_mem + (comfy.model_management.NUM_STREAMS * module_mem)) lowvram_fits = mem_counter + module_mem + potential_offload < lowvram_model_memory weight_key = "{}.weight".format(n) @@ -883,7 +883,7 @@ class ModelPatcher: break module_offload_mem, module_mem, n, m, params = unload - potential_offload = (comfy.model_management.NUM_STREAMS + 1) * module_offload_mem + potential_offload = module_offload_mem + (comfy.model_management.NUM_STREAMS * module_mem) lowvram_possible = hasattr(m, "comfy_cast_weights") if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True: diff --git a/comfy/ops.py b/comfy/ops.py index 61a2f0754..eae434e68 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -111,22 +111,24 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of if s.bias is not None: bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=bias_has_function, stream=offload_stream) - if bias_has_function: - with wf_context: - for f in s.bias_function: - bias = f(bias) + comfy.model_management.sync_stream(device, offload_stream) + + bias_a = bias + weight_a = weight + + if s.bias is not None: + for f in s.bias_function: + bias = f(bias) if weight_has_function or weight.dtype != dtype: - with wf_context: - weight = weight.to(dtype=dtype) - if isinstance(weight, QuantizedTensor): - weight = weight.dequantize() - for f in s.weight_function: - weight = f(weight) + weight = weight.to(dtype=dtype) + if isinstance(weight, QuantizedTensor): + weight = weight.dequantize() + for f in s.weight_function: + weight = f(weight) - comfy.model_management.sync_stream(device, offload_stream) if offloadable: - return weight, bias, offload_stream + return weight, bias, (offload_stream, weight_a, bias_a) else: #Legacy function signature return weight, bias @@ -135,13 +137,16 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of def uncast_bias_weight(s, weight, bias, offload_stream): if offload_stream is None: return - if weight is not None: - device = weight.device + os, weight_a, bias_a = offload_stream + if os is None: + return + if weight_a is not None: + device = weight_a.device else: - if bias is None: + if bias_a is None: return - device = bias.device - offload_stream.wait_stream(comfy.model_management.current_stream(device)) + device = bias_a.device + os.wait_stream(comfy.model_management.current_stream(device)) class CastWeightBiasOp: From 19f2192d69d13445131b72ad1d87167f59b66fc4 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 3 Dec 2025 18:37:35 +0200 Subject: [PATCH 676/759] fix(V3-Schema): use empty list defaults for Schema.inputs/outputs/hidden to avoid None issues (#11083) --- comfy_api/latest/_io.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 257f07c42..866c3e0eb 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -5,7 +5,7 @@ import inspect from abc import ABC, abstractmethod from collections import Counter from collections.abc import Iterable -from dataclasses import asdict, dataclass +from dataclasses import asdict, dataclass, field from enum import Enum from typing import Any, Callable, Literal, TypedDict, TypeVar, TYPE_CHECKING from typing_extensions import NotRequired, final @@ -1199,9 +1199,9 @@ class Schema: """Display name of node.""" category: str = "sd" """The category of the node, as per the "Add Node" menu.""" - inputs: list[Input]=None - outputs: list[Output]=None - hidden: list[Hidden]=None + inputs: list[Input] = field(default_factory=list) + outputs: list[Output] = field(default_factory=list) + hidden: list[Hidden] = field(default_factory=list) description: str="" """Node description, shown as a tooltip when hovering over the node.""" is_input_list: bool = False From 87c104bfc1928f0b018a50f5867f425e10482929 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 3 Dec 2025 18:55:44 +0200 Subject: [PATCH 677/759] add support for "@image" reference format in Kling Omni API nodes (#11082) --- comfy_api_nodes/apis/kling_api.py | 30 +++++-- comfy_api_nodes/nodes_kling.py | 138 ++++++++++++++++++++++++++++-- 2 files changed, 155 insertions(+), 13 deletions(-) diff --git a/comfy_api_nodes/apis/kling_api.py b/comfy_api_nodes/apis/kling_api.py index 0a3b447c5..d8949f8ac 100644 --- a/comfy_api_nodes/apis/kling_api.py +++ b/comfy_api_nodes/apis/kling_api.py @@ -46,21 +46,41 @@ class TaskStatusVideoResult(BaseModel): url: str | None = Field(None, description="URL for generated video") -class TaskStatusVideoResults(BaseModel): +class TaskStatusImageResult(BaseModel): + index: int = Field(..., description="Image Number,0-9") + url: str = Field(..., description="URL for generated image") + + +class OmniTaskStatusResults(BaseModel): videos: list[TaskStatusVideoResult] | None = Field(None) + images: list[TaskStatusImageResult] | None = Field(None) -class TaskStatusVideoResponseData(BaseModel): +class OmniTaskStatusResponseData(BaseModel): created_at: int | None = Field(None, description="Task creation time") updated_at: int | None = Field(None, description="Task update time") task_status: str | None = None task_status_msg: str | None = Field(None, description="Additional failure reason. Only for polling endpoint.") task_id: str | None = Field(None, description="Task ID") - task_result: TaskStatusVideoResults | None = Field(None) + task_result: OmniTaskStatusResults | None = Field(None) -class TaskStatusVideoResponse(BaseModel): +class OmniTaskStatusResponse(BaseModel): code: int | None = Field(None, description="Error code") message: str | None = Field(None, description="Error message") request_id: str | None = Field(None, description="Request ID") - data: TaskStatusVideoResponseData | None = Field(None) + data: OmniTaskStatusResponseData | None = Field(None) + + +class OmniImageParamImage(BaseModel): + image: str = Field(...) + + +class OmniProImageRequest(BaseModel): + model_name: str = Field(..., description="kling-image-o1") + resolution: str = Field(..., description="'1k' or '2k'") + aspect_ratio: str | None = Field(...) + prompt: str = Field(...) + mode: str = Field("pro") + n: int | None = Field(1, le=9) + image_list: list[OmniImageParamImage] | None = Field(..., max_length=10) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 850c44db6..6c840dc47 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -6,6 +6,7 @@ For source of truth on the allowed permutations of request fields, please refere import logging import math +import re import torch from typing_extensions import override @@ -49,12 +50,14 @@ from comfy_api_nodes.apis import ( KlingSingleImageEffectModelName, ) from comfy_api_nodes.apis.kling_api import ( + OmniImageParamImage, OmniParamImage, OmniParamVideo, OmniProFirstLastFrameRequest, + OmniProImageRequest, OmniProReferences2VideoRequest, OmniProText2VideoRequest, - TaskStatusVideoResponse, + OmniTaskStatusResponse, ) from comfy_api_nodes.util import ( ApiEndpoint, @@ -210,7 +213,36 @@ VOICES_CONFIG = { } -async def finish_omni_video_task(cls: type[IO.ComfyNode], response: TaskStatusVideoResponse) -> IO.NodeOutput: +def normalize_omni_prompt_references(prompt: str) -> str: + """ + Rewrites Kling Omni-style placeholders used in the app, like: + + @image, @image1, @image2, ... @imageN + @video, @video1, @video2, ... @videoN + + into the API-compatible form: + + <<>>, <<>>, ... + <<>>, <<>>, ... + + This is a UX shim for ComfyUI so users can type the same syntax as in the Kling app. + """ + if not prompt: + return prompt + + def _image_repl(match): + return f"<<>>" + + def _video_repl(match): + return f"<<>>" + + # (? and not @imageFoo + prompt = re.sub(r"(?\d*)(?!\w)", _image_repl, prompt) + return re.sub(r"(?\d*)(?!\w)", _video_repl, prompt) + + +async def finish_omni_video_task(cls: type[IO.ComfyNode], response: OmniTaskStatusResponse) -> IO.NodeOutput: if response.code: raise RuntimeError( f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}" @@ -218,8 +250,9 @@ async def finish_omni_video_task(cls: type[IO.ComfyNode], response: TaskStatusVi final_response = await poll_op( cls, ApiEndpoint(path=f"/proxy/kling/v1/videos/omni-video/{response.data.task_id}"), - response_model=TaskStatusVideoResponse, + response_model=OmniTaskStatusResponse, status_extractor=lambda r: (r.data.task_status if r.data else None), + max_poll_attempts=160, ) return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url)) @@ -801,7 +834,7 @@ class OmniProTextToVideoNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=TaskStatusVideoResponse, + response_model=OmniTaskStatusResponse, data=OmniProText2VideoRequest( model_name=model_name, prompt=prompt, @@ -864,6 +897,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode): end_frame: Input.Image | None = None, reference_images: Input.Image | None = None, ) -> IO.NodeOutput: + prompt = normalize_omni_prompt_references(prompt) validate_string(prompt, min_length=1, max_length=2500) if end_frame is not None and reference_images is not None: raise ValueError("The 'end_frame' input cannot be used simultaneously with 'reference_images'.") @@ -895,7 +929,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=TaskStatusVideoResponse, + response_model=OmniTaskStatusResponse, data=OmniProFirstLastFrameRequest( model_name=model_name, prompt=prompt, @@ -950,6 +984,7 @@ class OmniProImageToVideoNode(IO.ComfyNode): duration: int, reference_images: Input.Image, ) -> IO.NodeOutput: + prompt = normalize_omni_prompt_references(prompt) validate_string(prompt, min_length=1, max_length=2500) if get_number_of_images(reference_images) > 7: raise ValueError("The maximum number of reference images is 7.") @@ -962,7 +997,7 @@ class OmniProImageToVideoNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=TaskStatusVideoResponse, + response_model=OmniTaskStatusResponse, data=OmniProReferences2VideoRequest( model_name=model_name, prompt=prompt, @@ -1023,6 +1058,7 @@ class OmniProVideoToVideoNode(IO.ComfyNode): keep_original_sound: bool, reference_images: Input.Image | None = None, ) -> IO.NodeOutput: + prompt = normalize_omni_prompt_references(prompt) validate_string(prompt, min_length=1, max_length=2500) validate_video_duration(reference_video, min_duration=3.0, max_duration=10.05) validate_video_dimensions(reference_video, min_width=720, min_height=720, max_width=2160, max_height=2160) @@ -1045,7 +1081,7 @@ class OmniProVideoToVideoNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=TaskStatusVideoResponse, + response_model=OmniTaskStatusResponse, data=OmniProReferences2VideoRequest( model_name=model_name, prompt=prompt, @@ -1103,6 +1139,7 @@ class OmniProEditVideoNode(IO.ComfyNode): keep_original_sound: bool, reference_images: Input.Image | None = None, ) -> IO.NodeOutput: + prompt = normalize_omni_prompt_references(prompt) validate_string(prompt, min_length=1, max_length=2500) validate_video_duration(video, min_duration=3.0, max_duration=10.05) validate_video_dimensions(video, min_width=720, min_height=720, max_width=2160, max_height=2160) @@ -1125,7 +1162,7 @@ class OmniProEditVideoNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=TaskStatusVideoResponse, + response_model=OmniTaskStatusResponse, data=OmniProReferences2VideoRequest( model_name=model_name, prompt=prompt, @@ -1138,6 +1175,90 @@ class OmniProEditVideoNode(IO.ComfyNode): return await finish_omni_video_task(cls, response) +class OmniProImageNode(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingOmniProImageNode", + display_name="Kling Omni Image (Pro)", + category="api node/image/Kling", + description="Create or edit images with the latest model from Kling.", + inputs=[ + IO.Combo.Input("model_name", options=["kling-image-o1"]), + IO.String.Input( + "prompt", + multiline=True, + tooltip="A text prompt describing the image content. " + "This can include both positive and negative descriptions.", + ), + IO.Combo.Input("resolution", options=["1K", "2K"]), + IO.Combo.Input( + "aspect_ratio", + options=["16:9", "9:16", "1:1", "4:3", "3:4", "3:2", "2:3", "21:9"], + ), + IO.Image.Input( + "reference_images", + tooltip="Up to 10 additional reference images.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model_name: str, + prompt: str, + resolution: str, + aspect_ratio: str, + reference_images: Input.Image | None = None, + ) -> IO.NodeOutput: + prompt = normalize_omni_prompt_references(prompt) + validate_string(prompt, min_length=1, max_length=2500) + image_list: list[OmniImageParamImage] = [] + if reference_images is not None: + if get_number_of_images(reference_images) > 10: + raise ValueError("The maximum number of reference images is 10.") + for i in reference_images: + validate_image_dimensions(i, min_width=300, min_height=300) + validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1)) + for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image"): + image_list.append(OmniImageParamImage(image=i)) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/kling/v1/images/omni-image", method="POST"), + response_model=OmniTaskStatusResponse, + data=OmniProImageRequest( + model_name=model_name, + prompt=prompt, + resolution=resolution.lower(), + aspect_ratio=aspect_ratio, + image_list=image_list if image_list else None, + ), + ) + if response.code: + raise RuntimeError( + f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}" + ) + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/kling/v1/images/omni-image/{response.data.task_id}"), + response_model=OmniTaskStatusResponse, + status_extractor=lambda r: (r.data.task_status if r.data else None), + ) + return IO.NodeOutput(await download_url_to_image_tensor(final_response.data.task_result.images[0].url)) + + class KlingCameraControlT2VNode(IO.ComfyNode): """ Kling Text to Video Camera Control Node. This node is a text to video node, but it supports controlling the camera. @@ -1935,6 +2056,7 @@ class KlingExtension(ComfyExtension): OmniProImageToVideoNode, OmniProVideoToVideoNode, OmniProEditVideoNode, + # OmniProImageNode, # need support from backend ] From 440268d3940eb14a01595439bbc05c4aacde9c72 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 3 Dec 2025 23:52:31 +0200 Subject: [PATCH 678/759] convert nodes_load_3d.py to V3 schema (#10990) --- comfy_api/latest/_ui.py | 13 +++- comfy_extras/nodes_load_3d.py | 127 ++++++++++++++++------------------ 2 files changed, 71 insertions(+), 69 deletions(-) diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py index b0bbabe2a..6d1bea599 100644 --- a/comfy_api/latest/_ui.py +++ b/comfy_api/latest/_ui.py @@ -3,6 +3,7 @@ from __future__ import annotations import json import os import random +import uuid from io import BytesIO from typing import Type @@ -436,9 +437,19 @@ class PreviewUI3D(_UIOutput): def __init__(self, model_file, camera_info, **kwargs): self.model_file = model_file self.camera_info = camera_info + self.bg_image_path = None + bg_image = kwargs.get("bg_image", None) + if bg_image is not None: + img_array = (bg_image[0].cpu().numpy() * 255).astype(np.uint8) + img = PILImage.fromarray(img_array) + temp_dir = folder_paths.get_temp_directory() + filename = f"bg_{uuid.uuid4().hex}.png" + bg_image_path = os.path.join(temp_dir, filename) + img.save(bg_image_path, compress_level=1) + self.bg_image_path = f"temp/{filename}" def as_dict(self): - return {"result": [self.model_file, self.camera_info]} + return {"result": [self.model_file, self.camera_info, self.bg_image_path]} class PreviewText(_UIOutput): diff --git a/comfy_extras/nodes_load_3d.py b/comfy_extras/nodes_load_3d.py index 54c66ef68..545588ef8 100644 --- a/comfy_extras/nodes_load_3d.py +++ b/comfy_extras/nodes_load_3d.py @@ -2,22 +2,18 @@ import nodes import folder_paths import os -from comfy.comfy_types import IO -from comfy_api.input_impl import VideoFromFile +from typing_extensions import override +from comfy_api.latest import IO, ComfyExtension, InputImpl, UI from pathlib import Path -from PIL import Image -import numpy as np - -import uuid def normalize_path(path): return path.replace('\\', '/') -class Load3D(): +class Load3D(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): + def define_schema(cls): input_dir = os.path.join(folder_paths.get_input_directory(), "3d") os.makedirs(input_dir, exist_ok=True) @@ -30,23 +26,29 @@ class Load3D(): for file_path in input_path.rglob("*") if file_path.suffix.lower() in {'.gltf', '.glb', '.obj', '.fbx', '.stl'} ] + return IO.Schema( + node_id="Load3D", + display_name="Load 3D & Animation", + category="3d", + is_experimental=True, + inputs=[ + IO.Combo.Input("model_file", options=sorted(files), upload=IO.UploadType.model), + IO.Load3D.Input("image"), + IO.Int.Input("width", default=1024, min=1, max=4096, step=1), + IO.Int.Input("height", default=1024, min=1, max=4096, step=1), + ], + outputs=[ + IO.Image.Output(display_name="image"), + IO.Mask.Output(display_name="mask"), + IO.String.Output(display_name="mesh_path"), + IO.Image.Output(display_name="normal"), + IO.Load3DCamera.Output(display_name="camera_info"), + IO.Video.Output(display_name="recording_video"), + ], + ) - return {"required": { - "model_file": (sorted(files), {"file_upload": True}), - "image": ("LOAD_3D", {}), - "width": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}), - "height": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}), - }} - - RETURN_TYPES = ("IMAGE", "MASK", "STRING", "IMAGE", "LOAD3D_CAMERA", IO.VIDEO) - RETURN_NAMES = ("image", "mask", "mesh_path", "normal", "camera_info", "recording_video") - - FUNCTION = "process" - EXPERIMENTAL = True - - CATEGORY = "3d" - - def process(self, model_file, image, **kwargs): + @classmethod + def execute(cls, model_file, image, **kwargs) -> IO.NodeOutput: image_path = folder_paths.get_annotated_filepath(image['image']) mask_path = folder_paths.get_annotated_filepath(image['mask']) normal_path = folder_paths.get_annotated_filepath(image['normal']) @@ -61,58 +63,47 @@ class Load3D(): if image['recording'] != "": recording_video_path = folder_paths.get_annotated_filepath(image['recording']) - video = VideoFromFile(recording_video_path) + video = InputImpl.VideoFromFile(recording_video_path) - return output_image, output_mask, model_file, normal_image, image['camera_info'], video + return IO.NodeOutput(output_image, output_mask, model_file, normal_image, image['camera_info'], video) -class Preview3D(): + process = execute # TODO: remove + + +class Preview3D(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "model_file": ("STRING", {"default": "", "multiline": False}), - }, - "optional": { - "camera_info": ("LOAD3D_CAMERA", {}), - "bg_image": ("IMAGE", {}) - }} + def define_schema(cls): + return IO.Schema( + node_id="Preview3D", + display_name="Preview 3D & Animation", + category="3d", + is_experimental=True, + is_output_node=True, + inputs=[ + IO.String.Input("model_file", default="", multiline=False), + IO.Load3DCamera.Input("camera_info", optional=True), + IO.Image.Input("bg_image", optional=True), + ], + outputs=[], + ) - OUTPUT_NODE = True - RETURN_TYPES = () - - CATEGORY = "3d" - - FUNCTION = "process" - EXPERIMENTAL = True - - def process(self, model_file, **kwargs): + @classmethod + def execute(cls, model_file, **kwargs) -> IO.NodeOutput: camera_info = kwargs.get("camera_info", None) bg_image = kwargs.get("bg_image", None) + return IO.NodeOutput(ui=UI.PreviewUI3D(model_file, camera_info, bg_image=bg_image)) - bg_image_path = None - if bg_image is not None: + process = execute # TODO: remove - img_array = (bg_image[0].cpu().numpy() * 255).astype(np.uint8) - img = Image.fromarray(img_array) - temp_dir = folder_paths.get_temp_directory() - filename = f"bg_{uuid.uuid4().hex}.png" - bg_image_path = os.path.join(temp_dir, filename) - img.save(bg_image_path, compress_level=1) +class Load3DExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + Load3D, + Preview3D, + ] - bg_image_path = f"temp/{filename}" - return { - "ui": { - "result": [model_file, camera_info, bg_image_path] - } - } - -NODE_CLASS_MAPPINGS = { - "Load3D": Load3D, - "Preview3D": Preview3D, -} - -NODE_DISPLAY_NAME_MAPPINGS = { - "Load3D": "Load 3D & Animation", - "Preview3D": "Preview 3D & Animation", -} +async def comfy_entrypoint() -> Load3DExtension: + return Load3DExtension() From dce518c2b4f99634b5fdde1924d9b0bd468fe1ce Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 4 Dec 2025 03:35:04 +0200 Subject: [PATCH 679/759] convert nodes_audio.py to V3 schema (#10798) --- comfy_api/latest/_ui.py | 9 +- comfy_extras/nodes_audio.py | 744 ++++++++++++++++++------------------ 2 files changed, 382 insertions(+), 371 deletions(-) diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py index 6d1bea599..5a75a3aae 100644 --- a/comfy_api/latest/_ui.py +++ b/comfy_api/latest/_ui.py @@ -319,9 +319,10 @@ class AudioSaveHelper: for key, value in metadata.items(): output_container.metadata[key] = value + layout = "mono" if waveform.shape[0] == 1 else "stereo" # Set up the output stream with appropriate properties if format == "opus": - out_stream = output_container.add_stream("libopus", rate=sample_rate) + out_stream = output_container.add_stream("libopus", rate=sample_rate, layout=layout) if quality == "64k": out_stream.bit_rate = 64000 elif quality == "96k": @@ -333,7 +334,7 @@ class AudioSaveHelper: elif quality == "320k": out_stream.bit_rate = 320000 elif format == "mp3": - out_stream = output_container.add_stream("libmp3lame", rate=sample_rate) + out_stream = output_container.add_stream("libmp3lame", rate=sample_rate, layout=layout) if quality == "V0": # TODO i would really love to support V3 and V5 but there doesn't seem to be a way to set the qscale level, the property below is a bool out_stream.codec_context.qscale = 1 @@ -342,12 +343,12 @@ class AudioSaveHelper: elif quality == "320k": out_stream.bit_rate = 320000 else: # format == "flac": - out_stream = output_container.add_stream("flac", rate=sample_rate) + out_stream = output_container.add_stream("flac", rate=sample_rate, layout=layout) frame = av.AudioFrame.from_ndarray( waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format="flt", - layout="mono" if waveform.shape[0] == 1 else "stereo", + layout=layout, ) frame.sample_rate = sample_rate frame.pts = 0 diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 2ed7e0b22..812301fb7 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -6,65 +6,80 @@ import torch import comfy.model_management import folder_paths import os -import io -import json -import random import hashlib import node_helpers import logging -from comfy.cli_args import args -from comfy.comfy_types import FileLocator +from typing_extensions import override +from comfy_api.latest import ComfyExtension, IO, UI -class EmptyLatentAudio: - def __init__(self): - self.device = comfy.model_management.intermediate_device() +class EmptyLatentAudio(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="EmptyLatentAudio", + display_name="Empty Latent Audio", + category="latent/audio", + inputs=[ + IO.Float.Input("seconds", default=47.6, min=1.0, max=1000.0, step=0.1), + IO.Int.Input( + "batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch." + ), + ], + outputs=[IO.Latent.Output()], + ) @classmethod - def INPUT_TYPES(s): - return {"required": {"seconds": ("FLOAT", {"default": 47.6, "min": 1.0, "max": 1000.0, "step": 0.1}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent images in the batch."}), - }} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" - - CATEGORY = "latent/audio" - - def generate(self, seconds, batch_size): + def execute(cls, seconds, batch_size) -> IO.NodeOutput: length = round((seconds * 44100 / 2048) / 2) * 2 - latent = torch.zeros([batch_size, 64, length], device=self.device) - return ({"samples":latent, "type": "audio"}, ) + latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device()) + return IO.NodeOutput({"samples":latent, "type": "audio"}) -class ConditioningStableAudio: + generate = execute # TODO: remove + + +class ConditioningStableAudio(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "seconds_start": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1000.0, "step": 0.1}), - "seconds_total": ("FLOAT", {"default": 47.0, "min": 0.0, "max": 1000.0, "step": 0.1}), - }} + def define_schema(cls): + return IO.Schema( + node_id="ConditioningStableAudio", + category="conditioning", + inputs=[ + IO.Conditioning.Input("positive"), + IO.Conditioning.Input("negative"), + IO.Float.Input("seconds_start", default=0.0, min=0.0, max=1000.0, step=0.1), + IO.Float.Input("seconds_total", default=47.0, min=0.0, max=1000.0, step=0.1), + ], + outputs=[ + IO.Conditioning.Output(display_name="positive"), + IO.Conditioning.Output(display_name="negative"), + ], + ) - RETURN_TYPES = ("CONDITIONING","CONDITIONING") - RETURN_NAMES = ("positive", "negative") - - FUNCTION = "append" - - CATEGORY = "conditioning" - - def append(self, positive, negative, seconds_start, seconds_total): + @classmethod + def execute(cls, positive, negative, seconds_start, seconds_total) -> IO.NodeOutput: positive = node_helpers.conditioning_set_values(positive, {"seconds_start": seconds_start, "seconds_total": seconds_total}) negative = node_helpers.conditioning_set_values(negative, {"seconds_start": seconds_start, "seconds_total": seconds_total}) - return (positive, negative) + return IO.NodeOutput(positive, negative) -class VAEEncodeAudio: + append = execute # TODO: remove + + +class VAEEncodeAudio(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "audio": ("AUDIO", ), "vae": ("VAE", )}} - RETURN_TYPES = ("LATENT",) - FUNCTION = "encode" + def define_schema(cls): + return IO.Schema( + node_id="VAEEncodeAudio", + display_name="VAE Encode Audio", + category="latent/audio", + inputs=[ + IO.Audio.Input("audio"), + IO.Vae.Input("vae"), + ], + outputs=[IO.Latent.Output()], + ) - CATEGORY = "latent/audio" - - def encode(self, vae, audio): + @classmethod + def execute(cls, vae, audio) -> IO.NodeOutput: sample_rate = audio["sample_rate"] if 44100 != sample_rate: waveform = torchaudio.functional.resample(audio["waveform"], sample_rate, 44100) @@ -72,213 +87,134 @@ class VAEEncodeAudio: waveform = audio["waveform"] t = vae.encode(waveform.movedim(1, -1)) - return ({"samples":t}, ) + return IO.NodeOutput({"samples":t}) -class VAEDecodeAudio: + encode = execute # TODO: remove + + +class VAEDecodeAudio(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples": ("LATENT", ), "vae": ("VAE", )}} - RETURN_TYPES = ("AUDIO",) - FUNCTION = "decode" + def define_schema(cls): + return IO.Schema( + node_id="VAEDecodeAudio", + display_name="VAE Decode Audio", + category="latent/audio", + inputs=[ + IO.Latent.Input("samples"), + IO.Vae.Input("vae"), + ], + outputs=[IO.Audio.Output()], + ) - CATEGORY = "latent/audio" - - def decode(self, vae, samples): + @classmethod + def execute(cls, vae, samples) -> IO.NodeOutput: audio = vae.decode(samples["samples"]).movedim(-1, 1) std = torch.std(audio, dim=[1,2], keepdim=True) * 5.0 std[std < 1.0] = 1.0 audio /= std - return ({"waveform": audio, "sample_rate": 44100}, ) + return IO.NodeOutput({"waveform": audio, "sample_rate": 44100}) + + decode = execute # TODO: remove -def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=None, extra_pnginfo=None, quality="128k"): - - filename_prefix += self.prefix_append - full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir) - results: list[FileLocator] = [] - - # Prepare metadata dictionary - metadata = {} - if not args.disable_metadata: - if prompt is not None: - metadata["prompt"] = json.dumps(prompt) - if extra_pnginfo is not None: - for x in extra_pnginfo: - metadata[x] = json.dumps(extra_pnginfo[x]) - - # Opus supported sample rates - OPUS_RATES = [8000, 12000, 16000, 24000, 48000] - - for (batch_number, waveform) in enumerate(audio["waveform"].cpu()): - filename_with_batch_num = filename.replace("%batch_num%", str(batch_number)) - file = f"{filename_with_batch_num}_{counter:05}_.{format}" - output_path = os.path.join(full_output_folder, file) - - # Use original sample rate initially - sample_rate = audio["sample_rate"] - - # Handle Opus sample rate requirements - if format == "opus": - if sample_rate > 48000: - sample_rate = 48000 - elif sample_rate not in OPUS_RATES: - # Find the next highest supported rate - for rate in sorted(OPUS_RATES): - if rate > sample_rate: - sample_rate = rate - break - if sample_rate not in OPUS_RATES: # Fallback if still not supported - sample_rate = 48000 - - # Resample if necessary - if sample_rate != audio["sample_rate"]: - waveform = torchaudio.functional.resample(waveform, audio["sample_rate"], sample_rate) - - # Create output with specified format - output_buffer = io.BytesIO() - output_container = av.open(output_buffer, mode='w', format=format) - - # Set metadata on the container - for key, value in metadata.items(): - output_container.metadata[key] = value - - layout = 'mono' if waveform.shape[0] == 1 else 'stereo' - # Set up the output stream with appropriate properties - if format == "opus": - out_stream = output_container.add_stream("libopus", rate=sample_rate, layout=layout) - if quality == "64k": - out_stream.bit_rate = 64000 - elif quality == "96k": - out_stream.bit_rate = 96000 - elif quality == "128k": - out_stream.bit_rate = 128000 - elif quality == "192k": - out_stream.bit_rate = 192000 - elif quality == "320k": - out_stream.bit_rate = 320000 - elif format == "mp3": - out_stream = output_container.add_stream("libmp3lame", rate=sample_rate, layout=layout) - if quality == "V0": - #TODO i would really love to support V3 and V5 but there doesn't seem to be a way to set the qscale level, the property below is a bool - out_stream.codec_context.qscale = 1 - elif quality == "128k": - out_stream.bit_rate = 128000 - elif quality == "320k": - out_stream.bit_rate = 320000 - else: #format == "flac": - out_stream = output_container.add_stream("flac", rate=sample_rate, layout=layout) - - frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', layout=layout) - frame.sample_rate = sample_rate - frame.pts = 0 - output_container.mux(out_stream.encode(frame)) - - # Flush encoder - output_container.mux(out_stream.encode(None)) - - # Close containers - output_container.close() - - # Write the output to file - output_buffer.seek(0) - with open(output_path, 'wb') as f: - f.write(output_buffer.getbuffer()) - - results.append({ - "filename": file, - "subfolder": subfolder, - "type": self.type - }) - counter += 1 - - return { "ui": { "audio": results } } - -class SaveAudio: - def __init__(self): - self.output_dir = folder_paths.get_output_directory() - self.type = "output" - self.prefix_append = "" +class SaveAudio(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="SaveAudio", + display_name="Save Audio (FLAC)", + category="audio", + inputs=[ + IO.Audio.Input("audio"), + IO.String.Input("filename_prefix", default="audio/ComfyUI"), + ], + hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo], + is_output_node=True, + ) @classmethod - def INPUT_TYPES(s): - return {"required": { "audio": ("AUDIO", ), - "filename_prefix": ("STRING", {"default": "audio/ComfyUI"}), - }, - "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, - } + def execute(cls, audio, filename_prefix="ComfyUI", format="flac") -> IO.NodeOutput: + return IO.NodeOutput( + ui=UI.AudioSaveHelper.get_save_audio_ui(audio, filename_prefix=filename_prefix, cls=cls, format=format) + ) - RETURN_TYPES = () - FUNCTION = "save_flac" + save_flac = execute # TODO: remove - OUTPUT_NODE = True - CATEGORY = "audio" - - def save_flac(self, audio, filename_prefix="ComfyUI", format="flac", prompt=None, extra_pnginfo=None): - return save_audio(self, audio, filename_prefix, format, prompt, extra_pnginfo) - -class SaveAudioMP3: - def __init__(self): - self.output_dir = folder_paths.get_output_directory() - self.type = "output" - self.prefix_append = "" +class SaveAudioMP3(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="SaveAudioMP3", + display_name="Save Audio (MP3)", + category="audio", + inputs=[ + IO.Audio.Input("audio"), + IO.String.Input("filename_prefix", default="audio/ComfyUI"), + IO.Combo.Input("quality", options=["V0", "128k", "320k"], default="V0"), + ], + hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo], + is_output_node=True, + ) @classmethod - def INPUT_TYPES(s): - return {"required": { "audio": ("AUDIO", ), - "filename_prefix": ("STRING", {"default": "audio/ComfyUI"}), - "quality": (["V0", "128k", "320k"], {"default": "V0"}), - }, - "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, - } + def execute(cls, audio, filename_prefix="ComfyUI", format="mp3", quality="128k") -> IO.NodeOutput: + return IO.NodeOutput( + ui=UI.AudioSaveHelper.get_save_audio_ui( + audio, filename_prefix=filename_prefix, cls=cls, format=format, quality=quality + ) + ) - RETURN_TYPES = () - FUNCTION = "save_mp3" + save_mp3 = execute # TODO: remove - OUTPUT_NODE = True - CATEGORY = "audio" - - def save_mp3(self, audio, filename_prefix="ComfyUI", format="mp3", prompt=None, extra_pnginfo=None, quality="128k"): - return save_audio(self, audio, filename_prefix, format, prompt, extra_pnginfo, quality) - -class SaveAudioOpus: - def __init__(self): - self.output_dir = folder_paths.get_output_directory() - self.type = "output" - self.prefix_append = "" +class SaveAudioOpus(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="SaveAudioOpus", + display_name="Save Audio (Opus)", + category="audio", + inputs=[ + IO.Audio.Input("audio"), + IO.String.Input("filename_prefix", default="audio/ComfyUI"), + IO.Combo.Input("quality", options=["64k", "96k", "128k", "192k", "320k"], default="128k"), + ], + hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo], + is_output_node=True, + ) @classmethod - def INPUT_TYPES(s): - return {"required": { "audio": ("AUDIO", ), - "filename_prefix": ("STRING", {"default": "audio/ComfyUI"}), - "quality": (["64k", "96k", "128k", "192k", "320k"], {"default": "128k"}), - }, - "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, - } + def execute(cls, audio, filename_prefix="ComfyUI", format="opus", quality="V3") -> IO.NodeOutput: + return IO.NodeOutput( + ui=UI.AudioSaveHelper.get_save_audio_ui( + audio, filename_prefix=filename_prefix, cls=cls, format=format, quality=quality + ) + ) - RETURN_TYPES = () - FUNCTION = "save_opus" + save_opus = execute # TODO: remove - OUTPUT_NODE = True - CATEGORY = "audio" - - def save_opus(self, audio, filename_prefix="ComfyUI", format="opus", prompt=None, extra_pnginfo=None, quality="V3"): - return save_audio(self, audio, filename_prefix, format, prompt, extra_pnginfo, quality) - -class PreviewAudio(SaveAudio): - def __init__(self): - self.output_dir = folder_paths.get_temp_directory() - self.type = "temp" - self.prefix_append = "_temp_" + ''.join(random.choice("abcdefghijklmnopqrstupvxyz") for x in range(5)) +class PreviewAudio(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="PreviewAudio", + display_name="Preview Audio", + category="audio", + inputs=[ + IO.Audio.Input("audio"), + ], + hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo], + is_output_node=True, + ) @classmethod - def INPUT_TYPES(s): - return {"required": - {"audio": ("AUDIO", ), }, - "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, - } + def execute(cls, audio) -> IO.NodeOutput: + return IO.NodeOutput(ui=UI.PreviewAudio(audio, cls=cls)) + + save_flac = execute # TODO: remove + def f32_pcm(wav: torch.Tensor) -> torch.Tensor: """Convert audio to float 32 bits PCM format.""" @@ -316,26 +252,30 @@ def load(filepath: str) -> tuple[torch.Tensor, int]: wav = f32_pcm(wav) return wav, sr -class LoadAudio: +class LoadAudio(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): + def define_schema(cls): input_dir = folder_paths.get_input_directory() files = folder_paths.filter_files_content_types(os.listdir(input_dir), ["audio", "video"]) - return {"required": {"audio": (sorted(files), {"audio_upload": True})}} + return IO.Schema( + node_id="LoadAudio", + display_name="Load Audio", + category="audio", + inputs=[ + IO.Combo.Input("audio", upload=IO.UploadType.audio, options=sorted(files)), + ], + outputs=[IO.Audio.Output()], + ) - CATEGORY = "audio" - - RETURN_TYPES = ("AUDIO", ) - FUNCTION = "load" - - def load(self, audio): + @classmethod + def execute(cls, audio) -> IO.NodeOutput: audio_path = folder_paths.get_annotated_filepath(audio) waveform, sample_rate = load(audio_path) audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate} - return (audio, ) + return IO.NodeOutput(audio) @classmethod - def IS_CHANGED(s, audio): + def fingerprint_inputs(cls, audio): image_path = folder_paths.get_annotated_filepath(audio) m = hashlib.sha256() with open(image_path, 'rb') as f: @@ -343,46 +283,69 @@ class LoadAudio: return m.digest().hex() @classmethod - def VALIDATE_INPUTS(s, audio): + def validate_inputs(cls, audio): if not folder_paths.exists_annotated_filepath(audio): return "Invalid audio file: {}".format(audio) return True -class RecordAudio: + load = execute # TODO: remove + + +class RecordAudio(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"audio": ("AUDIO_RECORD", {})}} + def define_schema(cls): + return IO.Schema( + node_id="RecordAudio", + display_name="Record Audio", + category="audio", + inputs=[ + IO.Custom("AUDIO_RECORD").Input("audio"), + ], + outputs=[IO.Audio.Output()], + ) - CATEGORY = "audio" - - RETURN_TYPES = ("AUDIO", ) - FUNCTION = "load" - - def load(self, audio): + @classmethod + def execute(cls, audio) -> IO.NodeOutput: audio_path = folder_paths.get_annotated_filepath(audio) waveform, sample_rate = load(audio_path) audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate} - return (audio, ) + return IO.NodeOutput(audio) + + load = execute # TODO: remove -class TrimAudioDuration: +class TrimAudioDuration(IO.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "audio": ("AUDIO",), - "start_index": ("FLOAT", {"default": 0.0, "min": -0xffffffffffffffff, "max": 0xffffffffffffffff, "step": 0.01, "tooltip": "Start time in seconds, can be negative to count from the end (supports sub-seconds)."}), - "duration": ("FLOAT", {"default": 60.0, "min": 0.0, "step": 0.01, "tooltip": "Duration in seconds"}), - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TrimAudioDuration", + display_name="Trim Audio Duration", + description="Trim audio tensor into chosen time range.", + category="audio", + inputs=[ + IO.Audio.Input("audio"), + IO.Float.Input( + "start_index", + default=0.0, + min=-0xffffffffffffffff, + max=0xffffffffffffffff, + step=0.01, + tooltip="Start time in seconds, can be negative to count from the end (supports sub-seconds).", + ), + IO.Float.Input( + "duration", + default=60.0, + min=0.0, + step=0.01, + tooltip="Duration in seconds", + ), + ], + outputs=[IO.Audio.Output()], + ) - FUNCTION = "trim" - RETURN_TYPES = ("AUDIO",) - CATEGORY = "audio" - DESCRIPTION = "Trim audio tensor into chosen time range." - - def trim(self, audio, start_index, duration): + @classmethod + def execute(cls, audio, start_index, duration) -> IO.NodeOutput: waveform = audio["waveform"] sample_rate = audio["sample_rate"] audio_length = waveform.shape[-1] @@ -399,23 +362,30 @@ class TrimAudioDuration: if start_frame >= end_frame: raise ValueError("AudioTrim: Start time must be less than end time and be within the audio length.") - return ({"waveform": waveform[..., start_frame:end_frame], "sample_rate": sample_rate},) + return IO.NodeOutput({"waveform": waveform[..., start_frame:end_frame], "sample_rate": sample_rate}) + + trim = execute # TODO: remove -class SplitAudioChannels: +class SplitAudioChannels(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "audio": ("AUDIO",), - }} + def define_schema(cls): + return IO.Schema( + node_id="SplitAudioChannels", + display_name="Split Audio Channels", + description="Separates the audio into left and right channels.", + category="audio", + inputs=[ + IO.Audio.Input("audio"), + ], + outputs=[ + IO.Audio.Output(display_name="left"), + IO.Audio.Output(display_name="right"), + ], + ) - RETURN_TYPES = ("AUDIO", "AUDIO") - RETURN_NAMES = ("left", "right") - FUNCTION = "separate" - CATEGORY = "audio" - DESCRIPTION = "Separates the audio into left and right channels." - - def separate(self, audio): + @classmethod + def execute(cls, audio) -> IO.NodeOutput: waveform = audio["waveform"] sample_rate = audio["sample_rate"] @@ -425,7 +395,9 @@ class SplitAudioChannels: left_channel = waveform[..., 0:1, :] right_channel = waveform[..., 1:2, :] - return ({"waveform": left_channel, "sample_rate": sample_rate}, {"waveform": right_channel, "sample_rate": sample_rate}) + return IO.NodeOutput({"waveform": left_channel, "sample_rate": sample_rate}, {"waveform": right_channel, "sample_rate": sample_rate}) + + separate = execute # TODO: remove def match_audio_sample_rates(waveform_1, sample_rate_1, waveform_2, sample_rate_2): @@ -443,21 +415,29 @@ def match_audio_sample_rates(waveform_1, sample_rate_1, waveform_2, sample_rate_ return waveform_1, waveform_2, output_sample_rate -class AudioConcat: +class AudioConcat(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "audio1": ("AUDIO",), - "audio2": ("AUDIO",), - "direction": (['after', 'before'], {"default": 'after', "tooltip": "Whether to append audio2 after or before audio1."}), - }} + def define_schema(cls): + return IO.Schema( + node_id="AudioConcat", + display_name="Audio Concat", + description="Concatenates the audio1 to audio2 in the specified direction.", + category="audio", + inputs=[ + IO.Audio.Input("audio1"), + IO.Audio.Input("audio2"), + IO.Combo.Input( + "direction", + options=['after', 'before'], + default="after", + tooltip="Whether to append audio2 after or before audio1.", + ) + ], + outputs=[IO.Audio.Output()], + ) - RETURN_TYPES = ("AUDIO",) - FUNCTION = "concat" - CATEGORY = "audio" - DESCRIPTION = "Concatenates the audio1 to audio2 in the specified direction." - - def concat(self, audio1, audio2, direction): + @classmethod + def execute(cls, audio1, audio2, direction) -> IO.NodeOutput: waveform_1 = audio1["waveform"] waveform_2 = audio2["waveform"] sample_rate_1 = audio1["sample_rate"] @@ -477,26 +457,33 @@ class AudioConcat: elif direction == 'before': concatenated_audio = torch.cat((waveform_2, waveform_1), dim=2) - return ({"waveform": concatenated_audio, "sample_rate": output_sample_rate},) + return IO.NodeOutput({"waveform": concatenated_audio, "sample_rate": output_sample_rate}) + + concat = execute # TODO: remove -class AudioMerge: +class AudioMerge(IO.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "audio1": ("AUDIO",), - "audio2": ("AUDIO",), - "merge_method": (["add", "mean", "subtract", "multiply"], {"tooltip": "The method used to combine the audio waveforms."}), - }, - } + def define_schema(cls): + return IO.Schema( + node_id="AudioMerge", + display_name="Audio Merge", + description="Combine two audio tracks by overlaying their waveforms.", + category="audio", + inputs=[ + IO.Audio.Input("audio1"), + IO.Audio.Input("audio2"), + IO.Combo.Input( + "merge_method", + options=["add", "mean", "subtract", "multiply"], + tooltip="The method used to combine the audio waveforms.", + ) + ], + outputs=[IO.Audio.Output()], + ) - FUNCTION = "merge" - RETURN_TYPES = ("AUDIO",) - CATEGORY = "audio" - DESCRIPTION = "Combine two audio tracks by overlaying their waveforms." - - def merge(self, audio1, audio2, merge_method): + @classmethod + def execute(cls, audio1, audio2, merge_method) -> IO.NodeOutput: waveform_1 = audio1["waveform"] waveform_2 = audio2["waveform"] sample_rate_1 = audio1["sample_rate"] @@ -530,85 +517,108 @@ class AudioMerge: if max_val > 1.0: waveform = waveform / max_val - return ({"waveform": waveform, "sample_rate": output_sample_rate},) + return IO.NodeOutput({"waveform": waveform, "sample_rate": output_sample_rate}) + + merge = execute # TODO: remove -class AudioAdjustVolume: +class AudioAdjustVolume(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "audio": ("AUDIO",), - "volume": ("INT", {"default": 1.0, "min": -100, "max": 100, "tooltip": "Volume adjustment in decibels (dB). 0 = no change, +6 = double, -6 = half, etc"}), - }} + def define_schema(cls): + return IO.Schema( + node_id="AudioAdjustVolume", + display_name="Audio Adjust Volume", + category="audio", + inputs=[ + IO.Audio.Input("audio"), + IO.Int.Input( + "volume", + default=1, + min=-100, + max=100, + tooltip="Volume adjustment in decibels (dB). 0 = no change, +6 = double, -6 = half, etc", + ) + ], + outputs=[IO.Audio.Output()], + ) - RETURN_TYPES = ("AUDIO",) - FUNCTION = "adjust_volume" - CATEGORY = "audio" - - def adjust_volume(self, audio, volume): + @classmethod + def execute(cls, audio, volume) -> IO.NodeOutput: if volume == 0: - return (audio,) + return IO.NodeOutput(audio) waveform = audio["waveform"] sample_rate = audio["sample_rate"] gain = 10 ** (volume / 20) waveform = waveform * gain - return ({"waveform": waveform, "sample_rate": sample_rate},) + return IO.NodeOutput({"waveform": waveform, "sample_rate": sample_rate}) + + adjust_volume = execute # TODO: remove -class EmptyAudio: +class EmptyAudio(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "duration": ("FLOAT", {"default": 60.0, "min": 0.0, "max": 0xffffffffffffffff, "step": 0.01, "tooltip": "Duration of the empty audio clip in seconds"}), - "sample_rate": ("INT", {"default": 44100, "tooltip": "Sample rate of the empty audio clip."}), - "channels": ("INT", {"default": 2, "min": 1, "max": 2, "tooltip": "Number of audio channels (1 for mono, 2 for stereo)."}), - }} + def define_schema(cls): + return IO.Schema( + node_id="EmptyAudio", + display_name="Empty Audio", + category="audio", + inputs=[ + IO.Float.Input( + "duration", + default=60.0, + min=0.0, + max=0xffffffffffffffff, + step=0.01, + tooltip="Duration of the empty audio clip in seconds", + ), + IO.Float.Input( + "sample_rate", + default=44100, + tooltip="Sample rate of the empty audio clip.", + ), + IO.Float.Input( + "channels", + default=2, + min=1, + max=2, + tooltip="Number of audio channels (1 for mono, 2 for stereo).", + ), + ], + outputs=[IO.Audio.Output()], + ) - RETURN_TYPES = ("AUDIO",) - FUNCTION = "create_empty_audio" - CATEGORY = "audio" - - def create_empty_audio(self, duration, sample_rate, channels): + @classmethod + def execute(cls, duration, sample_rate, channels) -> IO.NodeOutput: num_samples = int(round(duration * sample_rate)) waveform = torch.zeros((1, channels, num_samples), dtype=torch.float32) - return ({"waveform": waveform, "sample_rate": sample_rate},) + return IO.NodeOutput({"waveform": waveform, "sample_rate": sample_rate}) + + create_empty_audio = execute # TODO: remove -NODE_CLASS_MAPPINGS = { - "EmptyLatentAudio": EmptyLatentAudio, - "VAEEncodeAudio": VAEEncodeAudio, - "VAEDecodeAudio": VAEDecodeAudio, - "SaveAudio": SaveAudio, - "SaveAudioMP3": SaveAudioMP3, - "SaveAudioOpus": SaveAudioOpus, - "LoadAudio": LoadAudio, - "PreviewAudio": PreviewAudio, - "ConditioningStableAudio": ConditioningStableAudio, - "RecordAudio": RecordAudio, - "TrimAudioDuration": TrimAudioDuration, - "SplitAudioChannels": SplitAudioChannels, - "AudioConcat": AudioConcat, - "AudioMerge": AudioMerge, - "AudioAdjustVolume": AudioAdjustVolume, - "EmptyAudio": EmptyAudio, -} +class AudioExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + EmptyLatentAudio, + VAEEncodeAudio, + VAEDecodeAudio, + SaveAudio, + SaveAudioMP3, + SaveAudioOpus, + LoadAudio, + PreviewAudio, + ConditioningStableAudio, + RecordAudio, + TrimAudioDuration, + SplitAudioChannels, + AudioConcat, + AudioMerge, + AudioAdjustVolume, + EmptyAudio, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "EmptyLatentAudio": "Empty Latent Audio", - "VAEEncodeAudio": "VAE Encode Audio", - "VAEDecodeAudio": "VAE Decode Audio", - "PreviewAudio": "Preview Audio", - "LoadAudio": "Load Audio", - "SaveAudio": "Save Audio (FLAC)", - "SaveAudioMP3": "Save Audio (MP3)", - "SaveAudioOpus": "Save Audio (Opus)", - "RecordAudio": "Record Audio", - "TrimAudioDuration": "Trim Audio Duration", - "SplitAudioChannels": "Split Audio Channels", - "AudioConcat": "Audio Concat", - "AudioMerge": "Audio Merge", - "AudioAdjustVolume": "Audio Adjust Volume", - "EmptyAudio": "Empty Audio", -} +async def comfy_entrypoint() -> AudioExtension: + return AudioExtension() From ecdc8697d53919a9178bf53ef327a110582db8ea Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 3 Dec 2025 19:49:28 -0800 Subject: [PATCH 680/759] Qwen Image Lora training fix from #11090 (#11094) --- comfy_extras/nodes_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py index cb24ab709..19b8baaf4 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -623,7 +623,7 @@ class TrainLoraNode(io.ComfyNode): noise = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(seed) if multi_res: # use first latent as dummy latent if multi_res - latents = latents[0].repeat(num_images, 1, 1, 1) + latents = latents[0].repeat((num_images,) + ((1,) * (latents[0].ndim - 1))) guider.sample( noise.generate_noise({"samples": latents}), latents, From ea17add3c62197b10fd0b71d9169d339adc55c47 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 3 Dec 2025 20:15:15 -0800 Subject: [PATCH 681/759] Fix case where text encoders where running on the CPU instead of GPU. (#11095) --- comfy/sd.py | 2 ++ comfy/sd1_clip.py | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/comfy/sd.py b/comfy/sd.py index f9e5efab5..734bd2845 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -193,6 +193,7 @@ class CLIP: self.cond_stage_model.set_clip_options({"projected_pooled": False}) self.load_model() + self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device}) all_hooks.reset() self.patcher.patch_hooks(None) if show_pbar: @@ -240,6 +241,7 @@ class CLIP: self.cond_stage_model.set_clip_options({"projected_pooled": False}) self.load_model() + self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device}) o = self.cond_stage_model.encode_token_weights(tokens) cond, pooled = o[:2] if return_dict: diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 0fc9ab3db..503a51843 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -147,6 +147,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): self.layer_norm_hidden_state = layer_norm_hidden_state self.return_projected_pooled = return_projected_pooled self.return_attention_masks = return_attention_masks + self.execution_device = None if layer == "hidden": assert layer_idx is not None @@ -163,6 +164,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): def set_clip_options(self, options): layer_idx = options.get("layer", self.layer_idx) self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled) + self.execution_device = options.get("execution_device", self.execution_device) if isinstance(self.layer, list) or self.layer == "all": pass elif layer_idx is None or abs(layer_idx) > self.num_layers: @@ -175,6 +177,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): self.layer = self.options_default[0] self.layer_idx = self.options_default[1] self.return_projected_pooled = self.options_default[2] + self.execution_device = None def process_tokens(self, tokens, device): end_token = self.special_tokens.get("end", None) @@ -258,7 +261,11 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): return torch.cat(embeds_out), torch.tensor(attention_masks, device=device, dtype=torch.long), num_tokens, embeds_info def forward(self, tokens): - device = self.transformer.get_input_embeddings().weight.device + if self.execution_device is None: + device = self.transformer.get_input_embeddings().weight.device + else: + device = self.execution_device + embeds, attention_mask, num_tokens, embeds_info = self.process_tokens(tokens, device) attention_mask_model = None From 6be85c7920224b45bbc6417e00147815e78c12a9 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Thu, 4 Dec 2025 14:28:44 +1000 Subject: [PATCH 682/759] mp: use look-ahead actuals for stream offload VRAM calculation (#11096) TIL that the WAN TE has a 2GB weight followed by 16MB as the next size down. This means that team 8GB VRAM would fully offload the TE in async offload mode as it just multiplied this giant size my the num streams. Do the more complex logic of summing up the upcoming to-load weight sizes to avoid triple counting this massive weight. partial unload does the converse of recording the NS most recent unloads as they go. --- comfy/model_patcher.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index df2d8e827..3dcac3eef 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -699,12 +699,12 @@ class ModelPatcher: offloaded = [] offload_buffer = 0 loading.sort(reverse=True) - for x in loading: + for i, x in enumerate(loading): module_offload_mem, module_mem, n, m, params = x lowvram_weight = False - potential_offload = max(offload_buffer, module_offload_mem + (comfy.model_management.NUM_STREAMS * module_mem)) + potential_offload = max(offload_buffer, module_offload_mem + sum([ x1[1] for x1 in loading[i+1:i+1+comfy.model_management.NUM_STREAMS]])) lowvram_fits = mem_counter + module_mem + potential_offload < lowvram_model_memory weight_key = "{}.weight".format(n) @@ -876,14 +876,18 @@ class ModelPatcher: patch_counter = 0 unload_list = self._load_list() unload_list.sort() + offload_buffer = self.model.model_offload_buffer_memory + if len(unload_list) > 0: + NS = comfy.model_management.NUM_STREAMS + offload_weight_factor = [ min(offload_buffer / (NS + 1), unload_list[0][1]) ] * NS for unload in unload_list: if memory_to_free + offload_buffer - self.model.model_offload_buffer_memory < memory_freed: break module_offload_mem, module_mem, n, m, params = unload - potential_offload = module_offload_mem + (comfy.model_management.NUM_STREAMS * module_mem) + potential_offload = module_offload_mem + sum(offload_weight_factor) lowvram_possible = hasattr(m, "comfy_cast_weights") if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True: @@ -935,6 +939,8 @@ class ModelPatcher: m.comfy_patched_weights = False memory_freed += module_mem offload_buffer = max(offload_buffer, potential_offload) + offload_weight_factor.append(module_mem) + offload_weight_factor.pop(0) logging.debug("freed {}".format(n)) for param in params: From f4bdf5f8302ef10db99644a8672e614ddb29c473 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Fri, 5 Dec 2025 03:50:04 +1000 Subject: [PATCH 683/759] sd: revise hy VAE VRAM (#11105) This was recently collapsed down to rolling VAE through temporal. Clamp The time dimension. --- comfy/sd.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index 734bd2845..fe4dd65f8 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -483,8 +483,10 @@ class VAE: self.latent_dim = 3 self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1] self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1]) - self.memory_used_decode = lambda shape, dtype: (1500 * shape[2] * shape[3] * shape[4] * (4 * 8 * 8)) * model_management.dtype_size(dtype) - self.memory_used_encode = lambda shape, dtype: (900 * max(shape[2], 2) * shape[3] * shape[4]) * model_management.dtype_size(dtype) + #This is likely to significantly over-estimate with single image or low frame counts as the + #implementation is able to completely skip caching. Rework if used as an image only VAE + self.memory_used_decode = lambda shape, dtype: (2800 * min(8, ((shape[2] - 1) * 4) + 1) * shape[3] * shape[4] * (8 * 8)) * model_management.dtype_size(dtype) + self.memory_used_encode = lambda shape, dtype: (1400 * min(9, shape[2]) * shape[3] * shape[4]) * model_management.dtype_size(dtype) self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] elif "decoder.unpatcher3d.wavelets" in sd: self.upscale_ratio = (lambda a: max(0, a * 8 - 7), 8, 8) From 9bc893c5bbd2838bdd15ebd40e3a3e548ce3e4f0 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Fri, 5 Dec 2025 03:50:36 +1000 Subject: [PATCH 684/759] sd: bump HY1.5 VAE estimate (#11107) Im able to push vram above estimate on partial unload. Bump the estimate. This is experimentally determined with a 720P and 480P datapoint calibrating for 24GB VRAM total. --- comfy/sd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/sd.py b/comfy/sd.py index fe4dd65f8..03bdb33d5 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -471,7 +471,7 @@ class VAE: decoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Decoder", 'params': ddconfig}) self.memory_used_encode = lambda shape, dtype: (1400 * 9 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype) - self.memory_used_decode = lambda shape, dtype: (2800 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (3600 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype) elif "decoder.conv_in.conv.weight" in sd: ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0} ddconfig["conv3d"] = True From 3c8456223c5f6a41af7d99219b391c8c58acb552 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 5 Dec 2025 00:05:28 +0200 Subject: [PATCH 685/759] [API Nodes]: fixes and refactor (#11104) * chore(api-nodes): applied ruff's pyupgrade(python3.10) to api-nodes client's to folder * chore(api-nodes): add validate_video_frame_count function from LTX PR * chore(api-nodes): replace deprecated V1 imports * fix(api-nodes): the types returned by the "poll_op" function are now correct. --- comfy_api_nodes/util/__init__.py | 2 + comfy_api_nodes/util/_helpers.py | 14 +-- comfy_api_nodes/util/client.py | 145 ++++++++++++----------- comfy_api_nodes/util/conversions.py | 21 ++-- comfy_api_nodes/util/download_helpers.py | 20 ++-- comfy_api_nodes/util/request_logger.py | 2 - comfy_api_nodes/util/upload_helpers.py | 16 ++- comfy_api_nodes/util/validation_utils.py | 61 ++++++---- 8 files changed, 146 insertions(+), 135 deletions(-) diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py index 80292fb3c..4cc22abfb 100644 --- a/comfy_api_nodes/util/__init__.py +++ b/comfy_api_nodes/util/__init__.py @@ -47,6 +47,7 @@ from .validation_utils import ( validate_string, validate_video_dimensions, validate_video_duration, + validate_video_frame_count, ) __all__ = [ @@ -94,6 +95,7 @@ __all__ = [ "validate_string", "validate_video_dimensions", "validate_video_duration", + "validate_video_frame_count", # Misc functions "get_fs_object_size", ] diff --git a/comfy_api_nodes/util/_helpers.py b/comfy_api_nodes/util/_helpers.py index 328fe5227..491e6b6a8 100644 --- a/comfy_api_nodes/util/_helpers.py +++ b/comfy_api_nodes/util/_helpers.py @@ -2,8 +2,8 @@ import asyncio import contextlib import os import time +from collections.abc import Callable from io import BytesIO -from typing import Callable, Optional, Union from comfy.cli_args import args from comfy.model_management import processing_interrupted @@ -35,12 +35,12 @@ def default_base_url() -> str: async def sleep_with_interrupt( seconds: float, - node_cls: Optional[type[IO.ComfyNode]], - label: Optional[str] = None, - start_ts: Optional[float] = None, - estimated_total: Optional[int] = None, + node_cls: type[IO.ComfyNode] | None, + label: str | None = None, + start_ts: float | None = None, + estimated_total: int | None = None, *, - display_callback: Optional[Callable[[type[IO.ComfyNode], str, int, Optional[int]], None]] = None, + display_callback: Callable[[type[IO.ComfyNode], str, int, int | None], None] | None = None, ): """ Sleep in 1s slices while: @@ -65,7 +65,7 @@ def mimetype_to_extension(mime_type: str) -> str: return mime_type.split("/")[-1].lower() -def get_fs_object_size(path_or_object: Union[str, BytesIO]) -> int: +def get_fs_object_size(path_or_object: str | BytesIO) -> int: if isinstance(path_or_object, str): return os.path.getsize(path_or_object) return len(path_or_object.getvalue()) diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py index bf01d7d36..bf37cba5f 100644 --- a/comfy_api_nodes/util/client.py +++ b/comfy_api_nodes/util/client.py @@ -4,10 +4,11 @@ import json import logging import time import uuid +from collections.abc import Callable, Iterable from dataclasses import dataclass from enum import Enum from io import BytesIO -from typing import Any, Callable, Iterable, Literal, Optional, Type, TypeVar, Union +from typing import Any, Literal, TypeVar from urllib.parse import urljoin, urlparse import aiohttp @@ -37,8 +38,8 @@ class ApiEndpoint: path: str, method: Literal["GET", "POST", "PUT", "DELETE", "PATCH"] = "GET", *, - query_params: Optional[dict[str, Any]] = None, - headers: Optional[dict[str, str]] = None, + query_params: dict[str, Any] | None = None, + headers: dict[str, str] | None = None, ): self.path = path self.method = method @@ -52,18 +53,18 @@ class _RequestConfig: endpoint: ApiEndpoint timeout: float content_type: str - data: Optional[dict[str, Any]] - files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] - multipart_parser: Optional[Callable] + data: dict[str, Any] | None + files: dict[str, Any] | list[tuple[str, Any]] | None + multipart_parser: Callable | None max_retries: int retry_delay: float retry_backoff: float wait_label: str = "Waiting" monitor_progress: bool = True - estimated_total: Optional[int] = None - final_label_on_success: Optional[str] = "Completed" - progress_origin_ts: Optional[float] = None - price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None + estimated_total: int | None = None + final_label_on_success: str | None = "Completed" + progress_origin_ts: float | None = None + price_extractor: Callable[[dict[str, Any]], float | None] | None = None @dataclass @@ -71,10 +72,10 @@ class _PollUIState: started: float status_label: str = "Queued" is_queued: bool = True - price: Optional[float] = None - estimated_duration: Optional[int] = None + price: float | None = None + estimated_duration: int | None = None base_processing_elapsed: float = 0.0 # sum of completed active intervals - active_since: Optional[float] = None # start time of current active interval (None if queued) + active_since: float | None = None # start time of current active interval (None if queued) _RETRY_STATUS = {408, 429, 500, 502, 503, 504} @@ -87,20 +88,20 @@ async def sync_op( cls: type[IO.ComfyNode], endpoint: ApiEndpoint, *, - response_model: Type[M], - price_extractor: Optional[Callable[[M], Optional[float]]] = None, - data: Optional[BaseModel] = None, - files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None, + response_model: type[M], + price_extractor: Callable[[M | Any], float | None] | None = None, + data: BaseModel | None = None, + files: dict[str, Any] | list[tuple[str, Any]] | None = None, content_type: str = "application/json", timeout: float = 3600.0, - multipart_parser: Optional[Callable] = None, + multipart_parser: Callable | None = None, max_retries: int = 3, retry_delay: float = 1.0, retry_backoff: float = 2.0, wait_label: str = "Waiting for server", - estimated_duration: Optional[int] = None, - final_label_on_success: Optional[str] = "Completed", - progress_origin_ts: Optional[float] = None, + estimated_duration: int | None = None, + final_label_on_success: str | None = "Completed", + progress_origin_ts: float | None = None, monitor_progress: bool = True, ) -> M: raw = await sync_op_raw( @@ -131,22 +132,22 @@ async def poll_op( cls: type[IO.ComfyNode], poll_endpoint: ApiEndpoint, *, - response_model: Type[M], - status_extractor: Callable[[M], Optional[Union[str, int]]], - progress_extractor: Optional[Callable[[M], Optional[int]]] = None, - price_extractor: Optional[Callable[[M], Optional[float]]] = None, - completed_statuses: Optional[list[Union[str, int]]] = None, - failed_statuses: Optional[list[Union[str, int]]] = None, - queued_statuses: Optional[list[Union[str, int]]] = None, - data: Optional[BaseModel] = None, + response_model: type[M], + status_extractor: Callable[[M | Any], str | int | None], + progress_extractor: Callable[[M | Any], int | None] | None = None, + price_extractor: Callable[[M | Any], float | None] | None = None, + completed_statuses: list[str | int] | None = None, + failed_statuses: list[str | int] | None = None, + queued_statuses: list[str | int] | None = None, + data: BaseModel | None = None, poll_interval: float = 5.0, max_poll_attempts: int = 120, timeout_per_poll: float = 120.0, max_retries_per_poll: int = 3, retry_delay_per_poll: float = 1.0, retry_backoff_per_poll: float = 2.0, - estimated_duration: Optional[int] = None, - cancel_endpoint: Optional[ApiEndpoint] = None, + estimated_duration: int | None = None, + cancel_endpoint: ApiEndpoint | None = None, cancel_timeout: float = 10.0, ) -> M: raw = await poll_op_raw( @@ -178,22 +179,22 @@ async def sync_op_raw( cls: type[IO.ComfyNode], endpoint: ApiEndpoint, *, - price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None, - data: Optional[Union[dict[str, Any], BaseModel]] = None, - files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None, + price_extractor: Callable[[dict[str, Any]], float | None] | None = None, + data: dict[str, Any] | BaseModel | None = None, + files: dict[str, Any] | list[tuple[str, Any]] | None = None, content_type: str = "application/json", timeout: float = 3600.0, - multipart_parser: Optional[Callable] = None, + multipart_parser: Callable | None = None, max_retries: int = 3, retry_delay: float = 1.0, retry_backoff: float = 2.0, wait_label: str = "Waiting for server", - estimated_duration: Optional[int] = None, + estimated_duration: int | None = None, as_binary: bool = False, - final_label_on_success: Optional[str] = "Completed", - progress_origin_ts: Optional[float] = None, + final_label_on_success: str | None = "Completed", + progress_origin_ts: float | None = None, monitor_progress: bool = True, -) -> Union[dict[str, Any], bytes]: +) -> dict[str, Any] | bytes: """ Make a single network request. - If as_binary=False (default): returns JSON dict (or {'_raw': ''} if non-JSON). @@ -229,21 +230,21 @@ async def poll_op_raw( cls: type[IO.ComfyNode], poll_endpoint: ApiEndpoint, *, - status_extractor: Callable[[dict[str, Any]], Optional[Union[str, int]]], - progress_extractor: Optional[Callable[[dict[str, Any]], Optional[int]]] = None, - price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None, - completed_statuses: Optional[list[Union[str, int]]] = None, - failed_statuses: Optional[list[Union[str, int]]] = None, - queued_statuses: Optional[list[Union[str, int]]] = None, - data: Optional[Union[dict[str, Any], BaseModel]] = None, + status_extractor: Callable[[dict[str, Any]], str | int | None], + progress_extractor: Callable[[dict[str, Any]], int | None] | None = None, + price_extractor: Callable[[dict[str, Any]], float | None] | None = None, + completed_statuses: list[str | int] | None = None, + failed_statuses: list[str | int] | None = None, + queued_statuses: list[str | int] | None = None, + data: dict[str, Any] | BaseModel | None = None, poll_interval: float = 5.0, max_poll_attempts: int = 120, timeout_per_poll: float = 120.0, max_retries_per_poll: int = 3, retry_delay_per_poll: float = 1.0, retry_backoff_per_poll: float = 2.0, - estimated_duration: Optional[int] = None, - cancel_endpoint: Optional[ApiEndpoint] = None, + estimated_duration: int | None = None, + cancel_endpoint: ApiEndpoint | None = None, cancel_timeout: float = 10.0, ) -> dict[str, Any]: """ @@ -261,7 +262,7 @@ async def poll_op_raw( consumed_attempts = 0 # counts only non-queued polls progress_bar = utils.ProgressBar(100) if progress_extractor else None - last_progress: Optional[int] = None + last_progress: int | None = None state = _PollUIState(started=started, estimated_duration=estimated_duration) stop_ticker = asyncio.Event() @@ -420,10 +421,10 @@ async def poll_op_raw( def _display_text( node_cls: type[IO.ComfyNode], - text: Optional[str], + text: str | None, *, - status: Optional[Union[str, int]] = None, - price: Optional[float] = None, + status: str | int | None = None, + price: float | None = None, ) -> None: display_lines: list[str] = [] if status: @@ -440,13 +441,13 @@ def _display_text( def _display_time_progress( node_cls: type[IO.ComfyNode], - status: Optional[Union[str, int]], + status: str | int | None, elapsed_seconds: int, - estimated_total: Optional[int] = None, + estimated_total: int | None = None, *, - price: Optional[float] = None, - is_queued: Optional[bool] = None, - processing_elapsed_seconds: Optional[int] = None, + price: float | None = None, + is_queued: bool | None = None, + processing_elapsed_seconds: int | None = None, ) -> None: if estimated_total is not None and estimated_total > 0 and is_queued is False: pe = processing_elapsed_seconds if processing_elapsed_seconds is not None else elapsed_seconds @@ -488,7 +489,7 @@ def _unpack_tuple(t: tuple) -> tuple[str, Any, str]: raise ValueError("files tuple must be (filename, file[, content_type])") -def _merge_params(endpoint_params: dict[str, Any], method: str, data: Optional[dict[str, Any]]) -> dict[str, Any]: +def _merge_params(endpoint_params: dict[str, Any], method: str, data: dict[str, Any] | None) -> dict[str, Any]: params = dict(endpoint_params or {}) if method.upper() == "GET" and data: for k, v in data.items(): @@ -534,9 +535,9 @@ def _generate_operation_id(method: str, path: str, attempt: int) -> str: def _snapshot_request_body_for_logging( content_type: str, method: str, - data: Optional[dict[str, Any]], - files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]], -) -> Optional[Union[dict[str, Any], str]]: + data: dict[str, Any] | None, + files: dict[str, Any] | list[tuple[str, Any]] | None, +) -> dict[str, Any] | str | None: if method.upper() == "GET": return None if content_type == "multipart/form-data": @@ -586,13 +587,13 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): attempt = 0 delay = cfg.retry_delay operation_succeeded: bool = False - final_elapsed_seconds: Optional[int] = None - extracted_price: Optional[float] = None + final_elapsed_seconds: int | None = None + extracted_price: float | None = None while True: attempt += 1 stop_event = asyncio.Event() - monitor_task: Optional[asyncio.Task] = None - sess: Optional[aiohttp.ClientSession] = None + monitor_task: asyncio.Task | None = None + sess: aiohttp.ClientSession | None = None operation_id = _generate_operation_id(method, cfg.endpoint.path, attempt) logging.debug("[DEBUG] HTTP %s %s (attempt %d)", method, url, attempt) @@ -887,7 +888,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool): ) -def _validate_or_raise(response_model: Type[M], payload: Any) -> M: +def _validate_or_raise(response_model: type[M], payload: Any) -> M: try: return response_model.model_validate(payload) except Exception as e: @@ -902,9 +903,9 @@ def _validate_or_raise(response_model: Type[M], payload: Any) -> M: def _wrap_model_extractor( - response_model: Type[M], - extractor: Optional[Callable[[M], Any]], -) -> Optional[Callable[[dict[str, Any]], Any]]: + response_model: type[M], + extractor: Callable[[M], Any] | None, +) -> Callable[[dict[str, Any]], Any] | None: """Wrap a typed extractor so it can be used by the dict-based poller. Validates the dict into `response_model` before invoking `extractor`. Uses a small per-wrapper cache keyed by `id(dict)` to avoid re-validating @@ -929,10 +930,10 @@ def _wrap_model_extractor( return _wrapped -def _normalize_statuses(values: Optional[Iterable[Union[str, int]]]) -> set[Union[str, int]]: +def _normalize_statuses(values: Iterable[str | int] | None) -> set[str | int]: if not values: return set() - out: set[Union[str, int]] = set() + out: set[str | int] = set() for v in values: nv = _normalize_status_value(v) if nv is not None: @@ -940,7 +941,7 @@ def _normalize_statuses(values: Optional[Iterable[Union[str, int]]]) -> set[Unio return out -def _normalize_status_value(val: Union[str, int, None]) -> Union[str, int, None]: +def _normalize_status_value(val: str | int | None) -> str | int | None: if isinstance(val, str): return val.strip().lower() return val diff --git a/comfy_api_nodes/util/conversions.py b/comfy_api_nodes/util/conversions.py index 971dc57de..c57457580 100644 --- a/comfy_api_nodes/util/conversions.py +++ b/comfy_api_nodes/util/conversions.py @@ -4,7 +4,6 @@ import math import mimetypes import uuid from io import BytesIO -from typing import Optional import av import numpy as np @@ -12,8 +11,7 @@ import torch from PIL import Image from comfy.utils import common_upscale -from comfy_api.latest import Input, InputImpl -from comfy_api.util import VideoCodec, VideoContainer +from comfy_api.latest import Input, InputImpl, Types from ._helpers import mimetype_to_extension @@ -57,7 +55,7 @@ def image_tensor_pair_to_batch(image1: torch.Tensor, image2: torch.Tensor) -> to def tensor_to_bytesio( image: torch.Tensor, - name: Optional[str] = None, + name: str | None = None, total_pixels: int = 2048 * 2048, mime_type: str = "image/png", ) -> BytesIO: @@ -177,8 +175,8 @@ def audio_to_base64_string(audio: Input.Audio, container_format: str = "mp4", co def video_to_base64_string( video: Input.Video, - container_format: VideoContainer = None, - codec: VideoCodec = None + container_format: Types.VideoContainer | None = None, + codec: Types.VideoCodec | None = None, ) -> str: """ Converts a video input to a base64 string. @@ -189,12 +187,11 @@ def video_to_base64_string( codec: Optional codec to use (defaults to video.codec if available) """ video_bytes_io = BytesIO() - - # Use provided format/codec if specified, otherwise use video's own if available - format_to_use = container_format if container_format is not None else getattr(video, 'container', VideoContainer.MP4) - codec_to_use = codec if codec is not None else getattr(video, 'codec', VideoCodec.H264) - - video.save_to(video_bytes_io, format=format_to_use, codec=codec_to_use) + video.save_to( + video_bytes_io, + format=container_format or getattr(video, "container", Types.VideoContainer.MP4), + codec=codec or getattr(video, "codec", Types.VideoCodec.H264), + ) video_bytes_io.seek(0) return base64.b64encode(video_bytes_io.getvalue()).decode("utf-8") diff --git a/comfy_api_nodes/util/download_helpers.py b/comfy_api_nodes/util/download_helpers.py index 14207dc68..3e0d0352d 100644 --- a/comfy_api_nodes/util/download_helpers.py +++ b/comfy_api_nodes/util/download_helpers.py @@ -3,15 +3,15 @@ import contextlib import uuid from io import BytesIO from pathlib import Path -from typing import IO, Optional, Union +from typing import IO from urllib.parse import urljoin, urlparse import aiohttp import torch from aiohttp.client_exceptions import ClientError, ContentTypeError -from comfy_api.input_impl import VideoFromFile from comfy_api.latest import IO as COMFY_IO +from comfy_api.latest import InputImpl from . import request_logger from ._helpers import ( @@ -29,9 +29,9 @@ _RETRY_STATUS = {408, 429, 500, 502, 503, 504} async def download_url_to_bytesio( url: str, - dest: Optional[Union[BytesIO, IO[bytes], str, Path]], + dest: BytesIO | IO[bytes] | str | Path | None, *, - timeout: Optional[float] = None, + timeout: float | None = None, max_retries: int = 5, retry_delay: float = 1.0, retry_backoff: float = 2.0, @@ -71,10 +71,10 @@ async def download_url_to_bytesio( is_path_sink = isinstance(dest, (str, Path)) fhandle = None - session: Optional[aiohttp.ClientSession] = None - stop_evt: Optional[asyncio.Event] = None - monitor_task: Optional[asyncio.Task] = None - req_task: Optional[asyncio.Task] = None + session: aiohttp.ClientSession | None = None + stop_evt: asyncio.Event | None = None + monitor_task: asyncio.Task | None = None + req_task: asyncio.Task | None = None try: with contextlib.suppress(Exception): @@ -234,11 +234,11 @@ async def download_url_to_video_output( timeout: float = None, max_retries: int = 5, cls: type[COMFY_IO.ComfyNode] = None, -) -> VideoFromFile: +) -> InputImpl.VideoFromFile: """Downloads a video from a URL and returns a `VIDEO` output.""" result = BytesIO() await download_url_to_bytesio(video_url, result, timeout=timeout, max_retries=max_retries, cls=cls) - return VideoFromFile(result) + return InputImpl.VideoFromFile(result) async def download_url_as_bytesio( diff --git a/comfy_api_nodes/util/request_logger.py b/comfy_api_nodes/util/request_logger.py index ac52e2eab..e0cb4428d 100644 --- a/comfy_api_nodes/util/request_logger.py +++ b/comfy_api_nodes/util/request_logger.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import datetime import hashlib import json diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py index 0532bea9a..b8d33f4d1 100644 --- a/comfy_api_nodes/util/upload_helpers.py +++ b/comfy_api_nodes/util/upload_helpers.py @@ -4,15 +4,13 @@ import logging import time import uuid from io import BytesIO -from typing import Optional from urllib.parse import urlparse import aiohttp import torch from pydantic import BaseModel, Field -from comfy_api.latest import IO, Input -from comfy_api.util import VideoCodec, VideoContainer +from comfy_api.latest import IO, Input, Types from . import request_logger from ._helpers import is_processing_interrupted, sleep_with_interrupt @@ -32,7 +30,7 @@ from .conversions import ( class UploadRequest(BaseModel): file_name: str = Field(..., description="Filename to upload") - content_type: Optional[str] = Field( + content_type: str | None = Field( None, description="Mime type of the file. For example: image/png, image/jpeg, video/mp4, etc.", ) @@ -56,7 +54,7 @@ async def upload_images_to_comfyapi( Uploads images to ComfyUI API and returns download URLs. To upload multiple images, stack them in the batch dimension first. """ - # if batch, try to upload each file if max_images is greater than 0 + # if batched, try to upload each file if max_images is greater than 0 download_urls: list[str] = [] is_batch = len(image.shape) > 3 batch_len = image.shape[0] if is_batch else 1 @@ -100,9 +98,9 @@ async def upload_video_to_comfyapi( cls: type[IO.ComfyNode], video: Input.Video, *, - container: VideoContainer = VideoContainer.MP4, - codec: VideoCodec = VideoCodec.H264, - max_duration: Optional[int] = None, + container: Types.VideoContainer = Types.VideoContainer.MP4, + codec: Types.VideoCodec = Types.VideoCodec.H264, + max_duration: int | None = None, wait_label: str | None = "Uploading", ) -> str: """ @@ -220,7 +218,7 @@ async def upload_file( return monitor_task = asyncio.create_task(_monitor()) - sess: Optional[aiohttp.ClientSession] = None + sess: aiohttp.ClientSession | None = None try: try: request_logger.log_request_response( diff --git a/comfy_api_nodes/util/validation_utils.py b/comfy_api_nodes/util/validation_utils.py index ec7006aed..f01edea96 100644 --- a/comfy_api_nodes/util/validation_utils.py +++ b/comfy_api_nodes/util/validation_utils.py @@ -1,9 +1,7 @@ import logging -from typing import Optional import torch -from comfy_api.input.video_types import VideoInput from comfy_api.latest import Input @@ -18,10 +16,10 @@ def get_image_dimensions(image: torch.Tensor) -> tuple[int, int]: def validate_image_dimensions( image: torch.Tensor, - min_width: Optional[int] = None, - max_width: Optional[int] = None, - min_height: Optional[int] = None, - max_height: Optional[int] = None, + min_width: int | None = None, + max_width: int | None = None, + min_height: int | None = None, + max_height: int | None = None, ): height, width = get_image_dimensions(image) @@ -37,8 +35,8 @@ def validate_image_dimensions( def validate_image_aspect_ratio( image: torch.Tensor, - min_ratio: Optional[tuple[float, float]] = None, # e.g. (1, 4) - max_ratio: Optional[tuple[float, float]] = None, # e.g. (4, 1) + min_ratio: tuple[float, float] | None = None, # e.g. (1, 4) + max_ratio: tuple[float, float] | None = None, # e.g. (4, 1) *, strict: bool = True, # True -> (min, max); False -> [min, max] ) -> float: @@ -54,8 +52,8 @@ def validate_image_aspect_ratio( def validate_images_aspect_ratio_closeness( first_image: torch.Tensor, second_image: torch.Tensor, - min_rel: float, # e.g. 0.8 - max_rel: float, # e.g. 1.25 + min_rel: float, # e.g. 0.8 + max_rel: float, # e.g. 1.25 *, strict: bool = False, # True -> (min, max); False -> [min, max] ) -> float: @@ -84,8 +82,8 @@ def validate_images_aspect_ratio_closeness( def validate_aspect_ratio_string( aspect_ratio: str, - min_ratio: Optional[tuple[float, float]] = None, # e.g. (1, 4) - max_ratio: Optional[tuple[float, float]] = None, # e.g. (4, 1) + min_ratio: tuple[float, float] | None = None, # e.g. (1, 4) + max_ratio: tuple[float, float] | None = None, # e.g. (4, 1) *, strict: bool = False, # True -> (min, max); False -> [min, max] ) -> float: @@ -97,10 +95,10 @@ def validate_aspect_ratio_string( def validate_video_dimensions( video: Input.Video, - min_width: Optional[int] = None, - max_width: Optional[int] = None, - min_height: Optional[int] = None, - max_height: Optional[int] = None, + min_width: int | None = None, + max_width: int | None = None, + min_height: int | None = None, + max_height: int | None = None, ): try: width, height = video.get_dimensions() @@ -120,8 +118,8 @@ def validate_video_dimensions( def validate_video_duration( video: Input.Video, - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, + min_duration: float | None = None, + max_duration: float | None = None, ): try: duration = video.get_duration() @@ -136,6 +134,23 @@ def validate_video_duration( raise ValueError(f"Video duration must be at most {max_duration}s, got {duration}s") +def validate_video_frame_count( + video: Input.Video, + min_frame_count: int | None = None, + max_frame_count: int | None = None, +): + try: + frame_count = video.get_frame_count() + except Exception as e: + logging.error("Error getting frame count of video: %s", e) + return + + if min_frame_count is not None and min_frame_count > frame_count: + raise ValueError(f"Video frame count must be at least {min_frame_count}, got {frame_count}") + if max_frame_count is not None and frame_count > max_frame_count: + raise ValueError(f"Video frame count must be at most {max_frame_count}, got {frame_count}") + + def get_number_of_images(images): if isinstance(images, torch.Tensor): return images.shape[0] if images.ndim >= 4 else 1 @@ -144,8 +159,8 @@ def get_number_of_images(images): def validate_audio_duration( audio: Input.Audio, - min_duration: Optional[float] = None, - max_duration: Optional[float] = None, + min_duration: float | None = None, + max_duration: float | None = None, ) -> None: sr = int(audio["sample_rate"]) dur = int(audio["waveform"].shape[-1]) / sr @@ -177,7 +192,7 @@ def validate_string( ) -def validate_container_format_is_mp4(video: VideoInput) -> None: +def validate_container_format_is_mp4(video: Input.Video) -> None: """Validates video container format is MP4.""" container_format = video.get_container_format() if container_format not in ["mp4", "mov,mp4,m4a,3gp,3g2,mj2"]: @@ -194,8 +209,8 @@ def _ratio_from_tuple(r: tuple[float, float]) -> float: def _assert_ratio_bounds( ar: float, *, - min_ratio: Optional[tuple[float, float]] = None, - max_ratio: Optional[tuple[float, float]] = None, + min_ratio: tuple[float, float] | None = None, + max_ratio: tuple[float, float] | None = None, strict: bool = True, ) -> None: """Validate a numeric aspect ratio against optional min/max ratio bounds.""" From 35fa091340c60612dfb71cb6822dc23b99a5dac2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 4 Dec 2025 19:52:09 -0800 Subject: [PATCH 686/759] Forgot to put this in README. (#11112) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 91fb510e1..ed857df9f 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/) - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/) - [Wan 2.2](https://comfyanonymous.github.io/ComfyUI_examples/wan22/) + - [Hunyuan Video 1.5](https://docs.comfy.org/tutorials/video/hunyuan/hunyuan-video-1-5) - Audio Models - [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/) - [ACE Step](https://comfyanonymous.github.io/ComfyUI_examples/audio/) From 0ec05b1481d12b299bc945dbd407b773cfb66483 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Fri, 5 Dec 2025 11:05:38 -0800 Subject: [PATCH 687/759] Remove line made unnecessary (and wrong) after transformer_options was added to NextDiT's _forward definition (#11118) --- comfy/ldm/lumina/model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index f1c1a0ec3..6c24fed9b 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -586,7 +586,6 @@ class NextDiT(nn.Module): cap_feats = self.cap_embedder(cap_feats) # (N, L, D) # todo check if able to batchify w.o. redundant compute patches = transformer_options.get("patches", {}) - transformer_options = kwargs.get("transformer_options", {}) x_is_tensor = isinstance(x, torch.Tensor) img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options) freqs_cis = freqs_cis.to(img.device) From 43071e3de3780f984a46549e90935a0bf405e9df Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 5 Dec 2025 11:35:42 -0800 Subject: [PATCH 688/759] Make old scaled fp8 format use the new mixed quant ops system. (#11000) --- comfy/model_base.py | 14 +- comfy/model_detection.py | 33 +--- comfy/model_patcher.py | 20 +-- comfy/ops.py | 145 +++++++----------- comfy/quant_ops.py | 30 ++-- comfy/sd.py | 68 ++++++-- comfy/sd1_clip.py | 22 +-- comfy/supported_models_base.py | 3 +- comfy/text_encoders/cosmos.py | 12 +- comfy/text_encoders/flux.py | 12 +- comfy/text_encoders/genmo.py | 6 +- comfy/text_encoders/hidream.py | 10 +- comfy/text_encoders/hunyuan_image.py | 12 +- comfy/text_encoders/hunyuan_video.py | 23 ++- comfy/text_encoders/lumina2.py | 6 +- comfy/text_encoders/omnigen2.py | 6 +- comfy/text_encoders/ovis.py | 5 +- comfy/text_encoders/pixart_t5.py | 6 +- comfy/text_encoders/qwen_image.py | 6 +- comfy/text_encoders/sd3_clip.py | 19 +-- comfy/text_encoders/wan.py | 6 +- comfy/text_encoders/z_image.py | 5 +- comfy/utils.py | 66 ++++++++ .../comfy_quant/test_mixed_precision.py | 18 ++- 24 files changed, 278 insertions(+), 275 deletions(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 9b76c285e..3cedd4f31 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -134,7 +134,7 @@ class BaseModel(torch.nn.Module): if not unet_config.get("disable_unet_model_creation", False): if model_config.custom_operations is None: fp8 = model_config.optimizations.get("fp8", False) - operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8, model_config=model_config) + operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, model_config=model_config) else: operations = model_config.custom_operations self.diffusion_model = unet_model(**unet_config, device=device, operations=operations) @@ -329,18 +329,6 @@ class BaseModel(torch.nn.Module): extra_sds.append(self.model_config.process_clip_vision_state_dict_for_saving(clip_vision_state_dict)) unet_state_dict = self.diffusion_model.state_dict() - - if self.model_config.scaled_fp8 is not None: - unet_state_dict["scaled_fp8"] = torch.tensor([], dtype=self.model_config.scaled_fp8) - - # Save mixed precision metadata - if hasattr(self.model_config, 'layer_quant_config') and self.model_config.layer_quant_config: - metadata = { - "format_version": "1.0", - "layers": self.model_config.layer_quant_config - } - unet_state_dict["_quantization_metadata"] = metadata - unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict) if self.model_type == ModelType.V_PREDICTION: diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 7d0517e61..fd1907627 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -6,20 +6,6 @@ import math import logging import torch - -def detect_layer_quantization(metadata): - quant_key = "_quantization_metadata" - if metadata is not None and quant_key in metadata: - quant_metadata = metadata.pop(quant_key) - quant_metadata = json.loads(quant_metadata) - if isinstance(quant_metadata, dict) and "layers" in quant_metadata: - logging.info(f"Found quantization metadata (version {quant_metadata.get('format_version', 'unknown')})") - return quant_metadata["layers"] - else: - raise ValueError("Invalid quantization metadata format") - return None - - def count_blocks(state_dict_keys, prefix_string): count = 0 while True: @@ -767,22 +753,11 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal if model_config is None and use_base_if_no_match: model_config = comfy.supported_models_base.BASE(unet_config) - scaled_fp8_key = "{}scaled_fp8".format(unet_key_prefix) - if scaled_fp8_key in state_dict: - scaled_fp8_weight = state_dict.pop(scaled_fp8_key) - model_config.scaled_fp8 = scaled_fp8_weight.dtype - if model_config.scaled_fp8 == torch.float32: - model_config.scaled_fp8 = torch.float8_e4m3fn - if scaled_fp8_weight.nelement() == 2: - model_config.optimizations["fp8"] = False - else: - model_config.optimizations["fp8"] = True - # Detect per-layer quantization (mixed precision) - layer_quant_config = detect_layer_quantization(metadata) - if layer_quant_config: - model_config.layer_quant_config = layer_quant_config - logging.info(f"Detected mixed precision quantization: {len(layer_quant_config)} layers quantized") + quant_config = comfy.utils.detect_layer_quantization(state_dict, unet_key_prefix) + if quant_config: + model_config.quant_config = quant_config + logging.info("Detected mixed precision quantization") return model_config diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 3dcac3eef..215784874 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -126,27 +126,11 @@ class LowVramPatch: def __init__(self, key, patches, convert_func=None, set_func=None): self.key = key self.patches = patches - self.convert_func = convert_func + self.convert_func = convert_func # TODO: remove self.set_func = set_func def __call__(self, weight): - intermediate_dtype = weight.dtype - if self.convert_func is not None: - weight = self.convert_func(weight, inplace=False) - - if intermediate_dtype not in [torch.float32, torch.float16, torch.bfloat16]: #intermediate_dtype has to be one that is supported in math ops - intermediate_dtype = torch.float32 - out = comfy.lora.calculate_weight(self.patches[self.key], weight.to(intermediate_dtype), self.key, intermediate_dtype=intermediate_dtype) - if self.set_func is None: - return comfy.float.stochastic_rounding(out, weight.dtype, seed=string_to_seed(self.key)) - else: - return self.set_func(out, seed=string_to_seed(self.key), return_weight=True) - - out = comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=intermediate_dtype) - if self.set_func is not None: - return self.set_func(out, seed=string_to_seed(self.key), return_weight=True).to(dtype=intermediate_dtype) - else: - return out + return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=weight.dtype) #The above patch logic may cast up the weight to fp32, and do math. Go with fp32 x 3 LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR = 3 diff --git a/comfy/ops.py b/comfy/ops.py index eae434e68..dc06709a1 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -23,6 +23,7 @@ from comfy.cli_args import args, PerformanceFeature import comfy.float import comfy.rmsnorm import contextlib +import json def run_every_op(): if torch.compiler.is_compiling(): @@ -422,22 +423,12 @@ def fp8_linear(self, input): if input.ndim == 3 or input.ndim == 2: w, bias, offload_stream = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype, offloadable=True) + scale_weight = torch.ones((), device=input.device, dtype=torch.float32) - scale_weight = self.scale_weight - scale_input = self.scale_input - if scale_weight is None: - scale_weight = torch.ones((), device=input.device, dtype=torch.float32) - else: - scale_weight = scale_weight.to(input.device) - - if scale_input is None: - scale_input = torch.ones((), device=input.device, dtype=torch.float32) - input = torch.clamp(input, min=-448, max=448, out=input) - layout_params_weight = {'scale': scale_input, 'orig_dtype': input_dtype} - quantized_input = QuantizedTensor(input.to(dtype).contiguous(), "TensorCoreFP8Layout", layout_params_weight) - else: - scale_input = scale_input.to(input.device) - quantized_input = QuantizedTensor.from_float(input, "TensorCoreFP8Layout", scale=scale_input, dtype=dtype) + scale_input = torch.ones((), device=input.device, dtype=torch.float32) + input = torch.clamp(input, min=-448, max=448, out=input) + layout_params_weight = {'scale': scale_input, 'orig_dtype': input_dtype} + quantized_input = QuantizedTensor(input.to(dtype).contiguous(), "TensorCoreFP8Layout", layout_params_weight) # Wrap weight in QuantizedTensor - this enables unified dispatch # Call F.linear - __torch_dispatch__ routes to fp8_linear handler in quant_ops.py! @@ -458,7 +449,7 @@ class fp8_ops(manual_cast): return None def forward_comfy_cast_weights(self, input): - if not self.training: + if len(self.weight_function) == 0 and len(self.bias_function) == 0: try: out = fp8_linear(self, input) if out is not None: @@ -471,59 +462,6 @@ class fp8_ops(manual_cast): uncast_bias_weight(self, weight, bias, offload_stream) return x -def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None): - logging.info("Using scaled fp8: fp8 matrix mult: {}, scale input: {}".format(fp8_matrix_mult, scale_input)) - class scaled_fp8_op(manual_cast): - class Linear(manual_cast.Linear): - def __init__(self, *args, **kwargs): - if override_dtype is not None: - kwargs['dtype'] = override_dtype - super().__init__(*args, **kwargs) - - def reset_parameters(self): - if not hasattr(self, 'scale_weight'): - self.scale_weight = torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False) - - if not scale_input: - self.scale_input = None - - if not hasattr(self, 'scale_input'): - self.scale_input = torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False) - return None - - def forward_comfy_cast_weights(self, input): - if fp8_matrix_mult: - out = fp8_linear(self, input) - if out is not None: - return out - - weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) - - if weight.numel() < input.numel(): #TODO: optimize - x = torch.nn.functional.linear(input, weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype), bias) - else: - x = torch.nn.functional.linear(input * self.scale_weight.to(device=weight.device, dtype=weight.dtype), weight, bias) - uncast_bias_weight(self, weight, bias, offload_stream) - return x - - def convert_weight(self, weight, inplace=False, **kwargs): - if inplace: - weight *= self.scale_weight.to(device=weight.device, dtype=weight.dtype) - return weight - else: - return weight.to(dtype=torch.float32) * self.scale_weight.to(device=weight.device, dtype=torch.float32) - - def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs): - weight = comfy.float.stochastic_rounding(weight / self.scale_weight.to(device=weight.device, dtype=weight.dtype), self.weight.dtype, seed=seed) - if return_weight: - return weight - if inplace_update: - self.weight.data.copy_(weight) - else: - self.weight = torch.nn.Parameter(weight, requires_grad=False) - - return scaled_fp8_op - CUBLAS_IS_AVAILABLE = False try: from cublas_ops import CublasLinear @@ -550,9 +488,9 @@ if CUBLAS_IS_AVAILABLE: from .quant_ops import QuantizedTensor, QUANT_ALGOS -def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False): +def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False): class MixedPrecisionOps(manual_cast): - _layer_quant_config = layer_quant_config + _quant_config = quant_config _compute_dtype = compute_dtype _full_precision_mm = full_precision_mm @@ -595,27 +533,36 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful manually_loaded_keys = [weight_key] - if layer_name not in MixedPrecisionOps._layer_quant_config: + layer_conf = state_dict.pop(f"{prefix}comfy_quant", None) + if layer_conf is not None: + layer_conf = json.loads(layer_conf.numpy().tobytes()) + + if layer_conf is None: self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False) else: - quant_format = MixedPrecisionOps._layer_quant_config[layer_name].get("format", None) - if quant_format is None: + self.quant_format = layer_conf.get("format", None) + if not self._full_precision_mm: + self._full_precision_mm = layer_conf.get("full_precision_matrix_mult", False) + + if self.quant_format is None: raise ValueError(f"Unknown quantization format for layer {layer_name}") - qconfig = QUANT_ALGOS[quant_format] + qconfig = QUANT_ALGOS[self.quant_format] self.layout_type = qconfig["comfy_tensor_layout"] weight_scale_key = f"{prefix}weight_scale" + scale = state_dict.pop(weight_scale_key, None) layout_params = { - 'scale': state_dict.pop(weight_scale_key, None), + 'scale': scale, 'orig_dtype': MixedPrecisionOps._compute_dtype, 'block_size': qconfig.get("group_size", None), } - if layout_params['scale'] is not None: + + if scale is not None: manually_loaded_keys.append(weight_scale_key) self.weight = torch.nn.Parameter( - QuantizedTensor(weight.to(device=device), self.layout_type, layout_params), + QuantizedTensor(weight.to(device=device, dtype=qconfig.get("storage_t", None)), self.layout_type, layout_params), requires_grad=False ) @@ -624,7 +571,7 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful _v = state_dict.pop(param_key, None) if _v is None: continue - setattr(self, param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False)) + self.register_parameter(param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False)) manually_loaded_keys.append(param_key) super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) @@ -633,6 +580,16 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful if key in missing_keys: missing_keys.remove(key) + def state_dict(self, *args, destination=None, prefix="", **kwargs): + sd = super().state_dict(*args, destination=destination, prefix=prefix, **kwargs) + if isinstance(self.weight, QuantizedTensor): + sd["{}weight_scale".format(prefix)] = self.weight._layout_params['scale'] + quant_conf = {"format": self.quant_format} + if self._full_precision_mm: + quant_conf["full_precision_matrix_mult"] = True + sd["{}comfy_quant".format(prefix)] = torch.frombuffer(json.dumps(quant_conf).encode('utf-8'), dtype=torch.uint8) + return sd + def _forward(self, input, weight, bias): return torch.nn.functional.linear(input, weight, bias) @@ -648,9 +605,8 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful if self._full_precision_mm or self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(input, *args, **kwargs) if (getattr(self, 'layout_type', None) is not None and - getattr(self, 'input_scale', None) is not None and not isinstance(input, QuantizedTensor)): - input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, dtype=self.weight.dtype) + input = QuantizedTensor.from_float(input, self.layout_type, scale=getattr(self, 'input_scale', None), dtype=self.weight.dtype) return self._forward(input, self.weight, self.bias) def convert_weight(self, weight, inplace=False, **kwargs): @@ -661,7 +617,7 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs): if getattr(self, 'layout_type', None) is not None: - weight = QuantizedTensor.from_float(weight, self.layout_type, scale=None, dtype=self.weight.dtype, stochastic_rounding=seed, inplace_ops=True) + weight = QuantizedTensor.from_float(weight, self.layout_type, scale="recalculate", dtype=self.weight.dtype, stochastic_rounding=seed, inplace_ops=True) else: weight = weight.to(self.weight.dtype) if return_weight: @@ -670,17 +626,28 @@ def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, ful assert inplace_update is False # TODO: eventually remove the inplace_update stuff self.weight = torch.nn.Parameter(weight, requires_grad=False) + def _apply(self, fn, recurse=True): # This is to get torch.compile + moving weights to another device working + if recurse: + for module in self.children(): + module._apply(fn) + + for key, param in self._parameters.items(): + if param is None: + continue + self.register_parameter(key, torch.nn.Parameter(fn(param), requires_grad=False)) + for key, buf in self._buffers.items(): + if buf is not None: + self._buffers[key] = fn(buf) + return self + return MixedPrecisionOps -def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None, model_config=None): +def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, model_config=None): fp8_compute = comfy.model_management.supports_fp8_compute(load_device) # TODO: if we support more ops this needs to be more granular - if model_config and hasattr(model_config, 'layer_quant_config') and model_config.layer_quant_config: - logging.info(f"Using mixed precision operations: {len(model_config.layer_quant_config)} quantized layers") - return mixed_precision_ops(model_config.layer_quant_config, compute_dtype, full_precision_mm=not fp8_compute) - - if scaled_fp8 is not None: - return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8) + if model_config and hasattr(model_config, 'quant_config') and model_config.quant_config: + logging.info("Using mixed precision operations") + return mixed_precision_ops(model_config.quant_config, compute_dtype, full_precision_mm=not fp8_compute) if ( fp8_compute and diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index bb1fb860c..571d3f760 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -238,6 +238,9 @@ class QuantizedTensor(torch.Tensor): def is_contiguous(self, *arg, **kwargs): return self._qdata.is_contiguous(*arg, **kwargs) + def storage(self): + return self._qdata.storage() + # ============================================================================== # Generic Utilities (Layout-Agnostic Operations) # ============================================================================== @@ -249,12 +252,6 @@ def _create_transformed_qtensor(qt, transform_fn): def _handle_device_transfer(qt, target_device, target_dtype=None, target_layout=None, op_name="to"): - if target_dtype is not None and target_dtype != qt.dtype: - logging.warning( - f"QuantizedTensor: dtype conversion requested to {target_dtype}, " - f"but not supported for quantized tensors. Ignoring dtype." - ) - if target_layout is not None and target_layout != torch.strided: logging.warning( f"QuantizedTensor: layout change requested to {target_layout}, " @@ -274,6 +271,8 @@ def _handle_device_transfer(qt, target_device, target_dtype=None, target_layout= logging.debug(f"QuantizedTensor.{op_name}: Moving from {current_device} to {target_device}") new_q_data = qt._qdata.to(device=target_device) new_params = _move_layout_params_to_device(qt._layout_params, target_device) + if target_dtype is not None: + new_params["orig_dtype"] = target_dtype new_qt = QuantizedTensor(new_q_data, qt._layout_type, new_params) logging.debug(f"QuantizedTensor.{op_name}: Created new tensor on {target_device}") return new_qt @@ -339,7 +338,9 @@ def generic_copy_(func, args, kwargs): # Copy from another quantized tensor qt_dest._qdata.copy_(src._qdata, non_blocking=non_blocking) qt_dest._layout_type = src._layout_type + orig_dtype = qt_dest._layout_params["orig_dtype"] _copy_layout_params_inplace(src._layout_params, qt_dest._layout_params, non_blocking=non_blocking) + qt_dest._layout_params["orig_dtype"] = orig_dtype else: # Copy from regular tensor - just copy raw data qt_dest._qdata.copy_(src) @@ -397,17 +398,20 @@ class TensorCoreFP8Layout(QuantizedLayout): def quantize(cls, tensor, scale=None, dtype=torch.float8_e4m3fn, stochastic_rounding=0, inplace_ops=False): orig_dtype = tensor.dtype - if scale is None: + if isinstance(scale, str) and scale == "recalculate": scale = torch.amax(tensor.abs()) / torch.finfo(dtype).max - if not isinstance(scale, torch.Tensor): - scale = torch.tensor(scale) - scale = scale.to(device=tensor.device, dtype=torch.float32) + if scale is not None: + if not isinstance(scale, torch.Tensor): + scale = torch.tensor(scale) + scale = scale.to(device=tensor.device, dtype=torch.float32) - if inplace_ops: - tensor *= (1.0 / scale).to(tensor.dtype) + if inplace_ops: + tensor *= (1.0 / scale).to(tensor.dtype) + else: + tensor = tensor * (1.0 / scale).to(tensor.dtype) else: - tensor = tensor * (1.0 / scale).to(tensor.dtype) + scale = torch.ones((), device=tensor.device, dtype=torch.float32) if stochastic_rounding > 0: tensor = comfy.float.stochastic_rounding(tensor, dtype=dtype, seed=stochastic_rounding) diff --git a/comfy/sd.py b/comfy/sd.py index 03bdb33d5..092715d79 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -968,10 +968,8 @@ def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DI clip_data = [] for p in ckpt_paths: sd, metadata = comfy.utils.load_torch_file(p, safe_load=True, return_metadata=True) - if metadata is not None: - quant_metadata = metadata.get("_quantization_metadata", None) - if quant_metadata is not None: - sd["_quantization_metadata"] = quant_metadata + if model_options.get("custom_operations", None) is None: + sd, metadata = comfy.utils.convert_old_quants(sd, model_prefix="", metadata=metadata) clip_data.append(sd) return load_text_encoder_state_dicts(clip_data, embedding_directory=embedding_directory, clip_type=clip_type, model_options=model_options) @@ -1088,7 +1086,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=True, t5=False) clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer elif clip_type == CLIPType.HIDREAM: - clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=False, clip_g=True, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None) + clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=False, clip_g=True, t5=False, llama=False, dtype_t5=None, dtype_llama=None) clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer else: clip_target.clip = sdxl_clip.SDXLRefinerClipModel @@ -1112,7 +1110,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None) elif clip_type == CLIPType.HIDREAM: clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data), - clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None) + clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None) clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer else: #CLIPType.MOCHI clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data)) @@ -1141,7 +1139,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None) elif te_model == TEModel.LLAMA3_8: clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**llama_detect(clip_data), - clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None) + clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None) clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer elif te_model == TEModel.QWEN25_3B: clip_target.clip = comfy.text_encoders.omnigen2.te(**llama_detect(clip_data)) @@ -1169,7 +1167,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False) clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer elif clip_type == CLIPType.HIDREAM: - clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None) + clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None) clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer else: clip_target.clip = sd1_clip.SD1ClipModel @@ -1224,8 +1222,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip parameters = 0 for c in clip_data: - if "_quantization_metadata" in c: - c.pop("_quantization_metadata") parameters += comfy.utils.calculate_parameters(c) tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options) @@ -1295,6 +1291,10 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix) load_device = model_management.get_torch_device() + custom_operations = model_options.get("custom_operations", None) + if custom_operations is None: + sd, metadata = comfy.utils.convert_old_quants(sd, diffusion_model_prefix, metadata=metadata) + model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix, metadata=metadata) if model_config is None: logging.warning("Warning, This is not a checkpoint file, trying to load it as a diffusion model only.") @@ -1303,18 +1303,22 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c return None return (diffusion_model, None, VAE(sd={}), None) # The VAE object is there to throw an exception if it's actually used' - unet_weight_dtype = list(model_config.supported_inference_dtypes) - if model_config.scaled_fp8 is not None: + if model_config.quant_config is not None: weight_dtype = None - model_config.custom_operations = model_options.get("custom_operations", None) + if custom_operations is not None: + model_config.custom_operations = custom_operations + unet_dtype = model_options.get("dtype", model_options.get("weight_dtype", None)) if unet_dtype is None: unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype, weight_dtype=weight_dtype) - manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes) + if model_config.quant_config is not None: + manual_cast_dtype = model_management.unet_manual_cast(None, load_device, model_config.supported_inference_dtypes) + else: + manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes) model_config.set_inference_dtype(unet_dtype, manual_cast_dtype) if model_config.clip_vision_prefix is not None: @@ -1332,6 +1336,27 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c vae = VAE(sd=vae_sd, metadata=metadata) if output_clip: + if te_model_options.get("custom_operations", None) is None: + scaled_fp8_list = [] + for k in list(sd.keys()): # Convert scaled fp8 to mixed ops + if k.endswith(".scaled_fp8"): + scaled_fp8_list.append(k[:-len("scaled_fp8")]) + + if len(scaled_fp8_list) > 0: + out_sd = {} + for k in sd: + skip = False + for pref in scaled_fp8_list: + skip = skip or k.startswith(pref) + if not skip: + out_sd[k] = sd[k] + + for pref in scaled_fp8_list: + quant_sd, qmetadata = comfy.utils.convert_old_quants(sd, pref, metadata={}) + for k in quant_sd: + out_sd[k] = quant_sd[k] + sd = out_sd + clip_target = model_config.clip_target(state_dict=sd) if clip_target is not None: clip_sd = model_config.process_clip_state_dict(sd) @@ -1394,6 +1419,9 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None): if len(temp_sd) > 0: sd = temp_sd + custom_operations = model_options.get("custom_operations", None) + if custom_operations is None: + sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata) parameters = comfy.utils.calculate_parameters(sd) weight_dtype = comfy.utils.weight_dtype(sd) @@ -1424,7 +1452,7 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None): offload_device = model_management.unet_offload_device() unet_weight_dtype = list(model_config.supported_inference_dtypes) - if model_config.scaled_fp8 is not None: + if model_config.quant_config is not None: weight_dtype = None if dtype is None: @@ -1432,12 +1460,15 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None): else: unet_dtype = dtype - if model_config.layer_quant_config is not None: + if model_config.quant_config is not None: manual_cast_dtype = model_management.unet_manual_cast(None, load_device, model_config.supported_inference_dtypes) else: manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes) model_config.set_inference_dtype(unet_dtype, manual_cast_dtype) - model_config.custom_operations = model_options.get("custom_operations", model_config.custom_operations) + + if custom_operations is not None: + model_config.custom_operations = custom_operations + if model_options.get("fp8_optimizations", False): model_config.optimizations["fp8"] = True @@ -1476,6 +1507,9 @@ def save_checkpoint(output_path, model, clip=None, vae=None, clip_vision=None, m if vae is not None: vae_sd = vae.get_sd() + if metadata is None: + metadata = {} + model_management.load_models_gpu(load_models, force_patch_weights=True) clip_vision_sd = clip_vision.get_sd() if clip_vision is not None else None sd = model.model.state_dict_for_saving(clip_sd, vae_sd, clip_vision_sd) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index 503a51843..962948dae 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -107,29 +107,17 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder): config[k] = v operations = model_options.get("custom_operations", None) - scaled_fp8 = None - quantization_metadata = model_options.get("quantization_metadata", None) + quant_config = model_options.get("quantization_metadata", None) if operations is None: - layer_quant_config = None - if quantization_metadata is not None: - layer_quant_config = json.loads(quantization_metadata).get("layers", None) - - if layer_quant_config is not None: - operations = comfy.ops.mixed_precision_ops(layer_quant_config, dtype, full_precision_mm=True) - logging.info(f"Using MixedPrecisionOps for text encoder: {len(layer_quant_config)} quantized layers") + if quant_config is not None: + operations = comfy.ops.mixed_precision_ops(quant_config, dtype, full_precision_mm=True) + logging.info("Using MixedPrecisionOps for text encoder") else: - # Fallback to scaled_fp8_ops for backward compatibility - scaled_fp8 = model_options.get("scaled_fp8", None) - if scaled_fp8 is not None: - operations = comfy.ops.scaled_fp8_ops(fp8_matrix_mult=False, override_dtype=scaled_fp8) - else: - operations = comfy.ops.manual_cast + operations = comfy.ops.manual_cast self.operations = operations self.transformer = model_class(config, dtype, device, self.operations) - if scaled_fp8 is not None: - self.transformer.scaled_fp8 = torch.nn.Parameter(torch.tensor([], dtype=scaled_fp8)) self.num_layers = self.transformer.num_layers diff --git a/comfy/supported_models_base.py b/comfy/supported_models_base.py index e4bd74514..9fd84d329 100644 --- a/comfy/supported_models_base.py +++ b/comfy/supported_models_base.py @@ -49,8 +49,7 @@ class BASE: manual_cast_dtype = None custom_operations = None - scaled_fp8 = None - layer_quant_config = None # Per-layer quantization configuration for mixed precision + quant_config = None # quantization configuration for mixed precision optimizations = {"fp8": False} @classmethod diff --git a/comfy/text_encoders/cosmos.py b/comfy/text_encoders/cosmos.py index a1adb5242..448381fa9 100644 --- a/comfy/text_encoders/cosmos.py +++ b/comfy/text_encoders/cosmos.py @@ -7,10 +7,10 @@ from transformers import T5TokenizerFast class T5XXLModel(sd1_clip.SDClipModel): def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}): textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_old_config_xxl.json") - t5xxl_scaled_fp8 = model_options.get("t5xxl_scaled_fp8", None) - if t5xxl_scaled_fp8 is not None: + t5xxl_quantization_metadata = model_options.get("t5xxl_quantization_metadata", None) + if t5xxl_quantization_metadata is not None: model_options = model_options.copy() - model_options["scaled_fp8"] = t5xxl_scaled_fp8 + model_options["quantization_metadata"] = t5xxl_quantization_metadata super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, zero_out_masked=attention_mask, model_options=model_options) @@ -30,12 +30,12 @@ class CosmosT5Tokenizer(sd1_clip.SD1Tokenizer): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer) -def te(dtype_t5=None, t5xxl_scaled_fp8=None): +def te(dtype_t5=None, t5_quantization_metadata=None): class CosmosTEModel_(CosmosT5XXL): def __init__(self, device="cpu", dtype=None, model_options={}): - if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options: + if t5_quantization_metadata is not None: model_options = model_options.copy() - model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 + model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata if dtype is None: dtype = dtype_t5 super().__init__(device=device, dtype=dtype, model_options=model_options) diff --git a/comfy/text_encoders/flux.py b/comfy/text_encoders/flux.py index 99f4812bb..21d93d757 100644 --- a/comfy/text_encoders/flux.py +++ b/comfy/text_encoders/flux.py @@ -63,12 +63,12 @@ class FluxClipModel(torch.nn.Module): else: return self.t5xxl.load_sd(sd) -def flux_clip(dtype_t5=None, t5xxl_scaled_fp8=None): +def flux_clip(dtype_t5=None, t5_quantization_metadata=None): class FluxClipModel_(FluxClipModel): def __init__(self, device="cpu", dtype=None, model_options={}): - if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options: + if t5_quantization_metadata is not None: model_options = model_options.copy() - model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 + model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata super().__init__(dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options) return FluxClipModel_ @@ -159,15 +159,13 @@ class Flux2TEModel(sd1_clip.SD1ClipModel): out = out.reshape(out.shape[0], out.shape[1], -1) return out, pooled, extra -def flux2_te(dtype_llama=None, llama_scaled_fp8=None, llama_quantization_metadata=None, pruned=False): +def flux2_te(dtype_llama=None, llama_quantization_metadata=None, pruned=False): class Flux2TEModel_(Flux2TEModel): def __init__(self, device="cpu", dtype=None, model_options={}): - if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: - model_options = model_options.copy() - model_options["scaled_fp8"] = llama_scaled_fp8 if dtype_llama is not None: dtype = dtype_llama if llama_quantization_metadata is not None: + model_options = model_options.copy() model_options["quantization_metadata"] = llama_quantization_metadata if pruned: model_options = model_options.copy() diff --git a/comfy/text_encoders/genmo.py b/comfy/text_encoders/genmo.py index 9dcf190a2..5daea8135 100644 --- a/comfy/text_encoders/genmo.py +++ b/comfy/text_encoders/genmo.py @@ -26,12 +26,12 @@ class MochiT5Tokenizer(sd1_clip.SD1Tokenizer): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer) -def mochi_te(dtype_t5=None, t5xxl_scaled_fp8=None): +def mochi_te(dtype_t5=None, t5_quantization_metadata=None): class MochiTEModel_(MochiT5XXL): def __init__(self, device="cpu", dtype=None, model_options={}): - if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options: + if t5_quantization_metadata is not None: model_options = model_options.copy() - model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 + model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata if dtype is None: dtype = dtype_t5 super().__init__(device=device, dtype=dtype, model_options=model_options) diff --git a/comfy/text_encoders/hidream.py b/comfy/text_encoders/hidream.py index dbcf52784..600b34480 100644 --- a/comfy/text_encoders/hidream.py +++ b/comfy/text_encoders/hidream.py @@ -142,14 +142,14 @@ class HiDreamTEModel(torch.nn.Module): return self.llama.load_sd(sd) -def hidream_clip(clip_l=True, clip_g=True, t5=True, llama=True, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None): +def hidream_clip(clip_l=True, clip_g=True, t5=True, llama=True, dtype_t5=None, dtype_llama=None, t5_quantization_metadata=None, llama_quantization_metadata=None): class HiDreamTEModel_(HiDreamTEModel): def __init__(self, device="cpu", dtype=None, model_options={}): - if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options: + if t5_quantization_metadata is not None: model_options = model_options.copy() - model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 - if llama_scaled_fp8 is not None and "llama_scaled_fp8" not in model_options: + model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata + if llama_quantization_metadata is not None: model_options = model_options.copy() - model_options["llama_scaled_fp8"] = llama_scaled_fp8 + model_options["llama_quantization_metadata"] = llama_quantization_metadata super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, llama=llama, dtype_t5=dtype_t5, dtype_llama=dtype_llama, device=device, dtype=dtype, model_options=model_options) return HiDreamTEModel_ diff --git a/comfy/text_encoders/hunyuan_image.py b/comfy/text_encoders/hunyuan_image.py index ff04726e1..cd198036c 100644 --- a/comfy/text_encoders/hunyuan_image.py +++ b/comfy/text_encoders/hunyuan_image.py @@ -40,10 +40,10 @@ class HunyuanImageTokenizer(QwenImageTokenizer): class Qwen25_7BVLIModel(sd1_clip.SDClipModel): def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options={}): - llama_scaled_fp8 = model_options.get("qwen_scaled_fp8", None) - if llama_scaled_fp8 is not None: + llama_quantization_metadata = model_options.get("llama_quantization_metadata", None) + if llama_quantization_metadata is not None: model_options = model_options.copy() - model_options["scaled_fp8"] = llama_scaled_fp8 + model_options["quantization_metadata"] = llama_quantization_metadata super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) @@ -91,12 +91,12 @@ class HunyuanImageTEModel(QwenImageTEModel): else: return super().load_sd(sd) -def te(byt5=True, dtype_llama=None, llama_scaled_fp8=None): +def te(byt5=True, dtype_llama=None, llama_quantization_metadata=None): class QwenImageTEModel_(HunyuanImageTEModel): def __init__(self, device="cpu", dtype=None, model_options={}): - if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: + if llama_quantization_metadata is not None: model_options = model_options.copy() - model_options["qwen_scaled_fp8"] = llama_scaled_fp8 + model_options["llama_quantization_metadata"] = llama_quantization_metadata if dtype_llama is not None: dtype = dtype_llama super().__init__(byt5=byt5, device=device, dtype=dtype, model_options=model_options) diff --git a/comfy/text_encoders/hunyuan_video.py b/comfy/text_encoders/hunyuan_video.py index 0110517bb..a9a6c525e 100644 --- a/comfy/text_encoders/hunyuan_video.py +++ b/comfy/text_encoders/hunyuan_video.py @@ -6,7 +6,7 @@ from transformers import LlamaTokenizerFast import torch import os import numbers - +import comfy.utils def llama_detect(state_dict, prefix=""): out = {} @@ -14,12 +14,9 @@ def llama_detect(state_dict, prefix=""): if t5_key in state_dict: out["dtype_llama"] = state_dict[t5_key].dtype - scaled_fp8_key = "{}scaled_fp8".format(prefix) - if scaled_fp8_key in state_dict: - out["llama_scaled_fp8"] = state_dict[scaled_fp8_key].dtype - - if "_quantization_metadata" in state_dict: - out["llama_quantization_metadata"] = state_dict["_quantization_metadata"] + quant = comfy.utils.detect_layer_quantization(state_dict, prefix) + if quant is not None: + out["llama_quantization_metadata"] = quant return out @@ -31,10 +28,10 @@ class LLAMA3Tokenizer(sd1_clip.SDTokenizer): class LLAMAModel(sd1_clip.SDClipModel): def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options={}, special_tokens={"start": 128000, "pad": 128258}): - llama_scaled_fp8 = model_options.get("llama_scaled_fp8", None) - if llama_scaled_fp8 is not None: + llama_quantization_metadata = model_options.get("llama_quantization_metadata", None) + if llama_quantization_metadata is not None: model_options = model_options.copy() - model_options["scaled_fp8"] = llama_scaled_fp8 + model_options["quantization_metadata"] = llama_quantization_metadata textmodel_json_config = {} vocab_size = model_options.get("vocab_size", None) @@ -161,11 +158,11 @@ class HunyuanVideoClipModel(torch.nn.Module): return self.llama.load_sd(sd) -def hunyuan_video_clip(dtype_llama=None, llama_scaled_fp8=None): +def hunyuan_video_clip(dtype_llama=None, llama_quantization_metadata=None): class HunyuanVideoClipModel_(HunyuanVideoClipModel): def __init__(self, device="cpu", dtype=None, model_options={}): - if llama_scaled_fp8 is not None and "llama_scaled_fp8" not in model_options: + if llama_quantization_metadata is not None: model_options = model_options.copy() - model_options["llama_scaled_fp8"] = llama_scaled_fp8 + model_options["llama_quantization_metadata"] = llama_quantization_metadata super().__init__(dtype_llama=dtype_llama, device=device, dtype=dtype, model_options=model_options) return HunyuanVideoClipModel_ diff --git a/comfy/text_encoders/lumina2.py b/comfy/text_encoders/lumina2.py index fd986e2c1..7a6cfdab2 100644 --- a/comfy/text_encoders/lumina2.py +++ b/comfy/text_encoders/lumina2.py @@ -40,7 +40,7 @@ class LuminaModel(sd1_clip.SD1ClipModel): super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options) -def te(dtype_llama=None, llama_scaled_fp8=None, model_type="gemma2_2b"): +def te(dtype_llama=None, llama_quantization_metadata=None, model_type="gemma2_2b"): if model_type == "gemma2_2b": model = Gemma2_2BModel elif model_type == "gemma3_4b": @@ -48,9 +48,9 @@ def te(dtype_llama=None, llama_scaled_fp8=None, model_type="gemma2_2b"): class LuminaTEModel_(LuminaModel): def __init__(self, device="cpu", dtype=None, model_options={}): - if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: + if llama_quantization_metadata is not None: model_options = model_options.copy() - model_options["scaled_fp8"] = llama_scaled_fp8 + model_options["quantization_metadata"] = llama_quantization_metadata if dtype_llama is not None: dtype = dtype_llama super().__init__(device=device, dtype=dtype, name=model_type, model_options=model_options, clip_model=model) diff --git a/comfy/text_encoders/omnigen2.py b/comfy/text_encoders/omnigen2.py index 1a01b2dd4..50aa4121f 100644 --- a/comfy/text_encoders/omnigen2.py +++ b/comfy/text_encoders/omnigen2.py @@ -32,12 +32,12 @@ class Omnigen2Model(sd1_clip.SD1ClipModel): super().__init__(device=device, dtype=dtype, name="qwen25_3b", clip_model=Qwen25_3BModel, model_options=model_options) -def te(dtype_llama=None, llama_scaled_fp8=None): +def te(dtype_llama=None, llama_quantization_metadata=None): class Omnigen2TEModel_(Omnigen2Model): def __init__(self, device="cpu", dtype=None, model_options={}): - if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: + if llama_quantization_metadata is not None: model_options = model_options.copy() - model_options["scaled_fp8"] = llama_scaled_fp8 + model_options["quantization_metadata"] = llama_quantization_metadata if dtype_llama is not None: dtype = dtype_llama super().__init__(device=device, dtype=dtype, model_options=model_options) diff --git a/comfy/text_encoders/ovis.py b/comfy/text_encoders/ovis.py index 81c9bd51c..5754424d2 100644 --- a/comfy/text_encoders/ovis.py +++ b/comfy/text_encoders/ovis.py @@ -55,12 +55,9 @@ class OvisTEModel(sd1_clip.SD1ClipModel): return out, pooled, {} -def te(dtype_llama=None, llama_scaled_fp8=None, llama_quantization_metadata=None): +def te(dtype_llama=None, llama_quantization_metadata=None): class OvisTEModel_(OvisTEModel): def __init__(self, device="cpu", dtype=None, model_options={}): - if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: - model_options = model_options.copy() - model_options["scaled_fp8"] = llama_scaled_fp8 if dtype_llama is not None: dtype = dtype_llama if llama_quantization_metadata is not None: diff --git a/comfy/text_encoders/pixart_t5.py b/comfy/text_encoders/pixart_t5.py index 5f383de07..e5e5f18be 100644 --- a/comfy/text_encoders/pixart_t5.py +++ b/comfy/text_encoders/pixart_t5.py @@ -30,12 +30,12 @@ class PixArtTokenizer(sd1_clip.SD1Tokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer) -def pixart_te(dtype_t5=None, t5xxl_scaled_fp8=None): +def pixart_te(dtype_t5=None, t5_quantization_metadata=None): class PixArtTEModel_(PixArtT5XXL): def __init__(self, device="cpu", dtype=None, model_options={}): - if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options: + if t5_quantization_metadata is not None: model_options = model_options.copy() - model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 + model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata if dtype is None: dtype = dtype_t5 super().__init__(device=device, dtype=dtype, model_options=model_options) diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py index c0d32a6ef..5c14dec23 100644 --- a/comfy/text_encoders/qwen_image.py +++ b/comfy/text_encoders/qwen_image.py @@ -85,12 +85,12 @@ class QwenImageTEModel(sd1_clip.SD1ClipModel): return out, pooled, extra -def te(dtype_llama=None, llama_scaled_fp8=None): +def te(dtype_llama=None, llama_quantization_metadata=None): class QwenImageTEModel_(QwenImageTEModel): def __init__(self, device="cpu", dtype=None, model_options={}): - if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: + if llama_quantization_metadata is not None: model_options = model_options.copy() - model_options["scaled_fp8"] = llama_scaled_fp8 + model_options["quantization_metadata"] = llama_quantization_metadata if dtype_llama is not None: dtype = dtype_llama super().__init__(device=device, dtype=dtype, model_options=model_options) diff --git a/comfy/text_encoders/sd3_clip.py b/comfy/text_encoders/sd3_clip.py index ff5d412db..8b153c72b 100644 --- a/comfy/text_encoders/sd3_clip.py +++ b/comfy/text_encoders/sd3_clip.py @@ -6,14 +6,15 @@ import torch import os import comfy.model_management import logging +import comfy.utils class T5XXLModel(sd1_clip.SDClipModel): def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=False, model_options={}): textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_xxl.json") - t5xxl_scaled_fp8 = model_options.get("t5xxl_scaled_fp8", None) - if t5xxl_scaled_fp8 is not None: + t5xxl_quantization_metadata = model_options.get("t5xxl_quantization_metadata", None) + if t5xxl_quantization_metadata is not None: model_options = model_options.copy() - model_options["scaled_fp8"] = t5xxl_scaled_fp8 + model_options["quantization_metadata"] = t5xxl_quantization_metadata model_options = {**model_options, "model_name": "t5xxl"} super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) @@ -25,9 +26,9 @@ def t5_xxl_detect(state_dict, prefix=""): if t5_key in state_dict: out["dtype_t5"] = state_dict[t5_key].dtype - scaled_fp8_key = "{}scaled_fp8".format(prefix) - if scaled_fp8_key in state_dict: - out["t5xxl_scaled_fp8"] = state_dict[scaled_fp8_key].dtype + quant = comfy.utils.detect_layer_quantization(state_dict, prefix) + if quant is not None: + out["t5_quantization_metadata"] = quant return out @@ -156,11 +157,11 @@ class SD3ClipModel(torch.nn.Module): else: return self.t5xxl.load_sd(sd) -def sd3_clip(clip_l=True, clip_g=True, t5=True, dtype_t5=None, t5xxl_scaled_fp8=None, t5_attention_mask=False): +def sd3_clip(clip_l=True, clip_g=True, t5=True, dtype_t5=None, t5_quantization_metadata=None, t5_attention_mask=False): class SD3ClipModel_(SD3ClipModel): def __init__(self, device="cpu", dtype=None, model_options={}): - if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options: + if t5_quantization_metadata is not None: model_options = model_options.copy() - model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 + model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, dtype_t5=dtype_t5, t5_attention_mask=t5_attention_mask, device=device, dtype=dtype, model_options=model_options) return SD3ClipModel_ diff --git a/comfy/text_encoders/wan.py b/comfy/text_encoders/wan.py index d50fa4b28..164a57edd 100644 --- a/comfy/text_encoders/wan.py +++ b/comfy/text_encoders/wan.py @@ -25,12 +25,12 @@ class WanT5Model(sd1_clip.SD1ClipModel): def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs): super().__init__(device=device, dtype=dtype, model_options=model_options, name="umt5xxl", clip_model=UMT5XXlModel, **kwargs) -def te(dtype_t5=None, t5xxl_scaled_fp8=None): +def te(dtype_t5=None, t5_quantization_metadata=None): class WanTEModel(WanT5Model): def __init__(self, device="cpu", dtype=None, model_options={}): - if t5xxl_scaled_fp8 is not None and "scaled_fp8" not in model_options: + if t5_quantization_metadata is not None: model_options = model_options.copy() - model_options["scaled_fp8"] = t5xxl_scaled_fp8 + model_options["quantization_metadata"] = t5_quantization_metadata if dtype_t5 is not None: dtype = dtype_t5 super().__init__(device=device, dtype=dtype, model_options=model_options) diff --git a/comfy/text_encoders/z_image.py b/comfy/text_encoders/z_image.py index bb9273b20..19adde0b7 100644 --- a/comfy/text_encoders/z_image.py +++ b/comfy/text_encoders/z_image.py @@ -34,12 +34,9 @@ class ZImageTEModel(sd1_clip.SD1ClipModel): super().__init__(device=device, dtype=dtype, name="qwen3_4b", clip_model=Qwen3_4BModel, model_options=model_options) -def te(dtype_llama=None, llama_scaled_fp8=None, llama_quantization_metadata=None): +def te(dtype_llama=None, llama_quantization_metadata=None): class ZImageTEModel_(ZImageTEModel): def __init__(self, device="cpu", dtype=None, model_options={}): - if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: - model_options = model_options.copy() - model_options["scaled_fp8"] = llama_scaled_fp8 if dtype_llama is not None: dtype = dtype_llama if llama_quantization_metadata is not None: diff --git a/comfy/utils.py b/comfy/utils.py index 37485e497..89846bc95 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -29,6 +29,7 @@ import itertools from torch.nn.functional import interpolate from einops import rearrange from comfy.cli_args import args +import json MMAP_TORCH_FILES = args.mmap_torch_files DISABLE_MMAP = args.disable_mmap @@ -1194,3 +1195,68 @@ def unpack_latents(combined_latent, latent_shapes): else: output_tensors = combined_latent return output_tensors + +def detect_layer_quantization(state_dict, prefix): + for k in state_dict: + if k.startswith(prefix) and k.endswith(".comfy_quant"): + logging.info("Found quantization metadata version 1") + return {"mixed_ops": True} + return None + +def convert_old_quants(state_dict, model_prefix="", metadata={}): + if metadata is None: + metadata = {} + + quant_metadata = None + if "_quantization_metadata" not in metadata: + scaled_fp8_key = "{}scaled_fp8".format(model_prefix) + + if scaled_fp8_key in state_dict: + scaled_fp8_weight = state_dict[scaled_fp8_key] + scaled_fp8_dtype = scaled_fp8_weight.dtype + if scaled_fp8_dtype == torch.float32: + scaled_fp8_dtype = torch.float8_e4m3fn + + if scaled_fp8_weight.nelement() == 2: + full_precision_matrix_mult = True + else: + full_precision_matrix_mult = False + + out_sd = {} + layers = {} + for k in list(state_dict.keys()): + if not k.startswith(model_prefix): + out_sd[k] = state_dict[k] + continue + k_out = k + w = state_dict.pop(k) + layer = None + if k_out.endswith(".scale_weight"): + layer = k_out[:-len(".scale_weight")] + k_out = "{}.weight_scale".format(layer) + + if layer is not None: + layer_conf = {"format": "float8_e4m3fn"} # TODO: check if anyone did some non e4m3fn scaled checkpoints + if full_precision_matrix_mult: + layer_conf["full_precision_matrix_mult"] = full_precision_matrix_mult + layers[layer] = layer_conf + + if k_out.endswith(".scale_input"): + layer = k_out[:-len(".scale_input")] + k_out = "{}.input_scale".format(layer) + if w.item() == 1.0: + continue + + out_sd[k_out] = w + + state_dict = out_sd + quant_metadata = {"layers": layers} + else: + quant_metadata = json.loads(metadata["_quantization_metadata"]) + + if quant_metadata is not None: + layers = quant_metadata["layers"] + for k, v in layers.items(): + state_dict["{}.comfy_quant".format(k)] = torch.frombuffer(json.dumps(v).encode('utf-8'), dtype=torch.uint8) + + return state_dict, metadata diff --git a/tests-unit/comfy_quant/test_mixed_precision.py b/tests-unit/comfy_quant/test_mixed_precision.py index 63361309f..3a54941e6 100644 --- a/tests-unit/comfy_quant/test_mixed_precision.py +++ b/tests-unit/comfy_quant/test_mixed_precision.py @@ -2,6 +2,7 @@ import unittest import torch import sys import os +import json # Add comfy to path sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) @@ -15,6 +16,7 @@ if not has_gpu(): from comfy import ops from comfy.quant_ops import QuantizedTensor +import comfy.utils class SimpleModel(torch.nn.Module): @@ -94,8 +96,9 @@ class TestMixedPrecisionOps(unittest.TestCase): "layer3.weight_scale": torch.tensor(1.5, dtype=torch.float32), } + state_dict, _ = comfy.utils.convert_old_quants(state_dict, metadata={"_quantization_metadata": json.dumps({"layers": layer_quant_config})}) # Create model and load state dict (strict=False because custom loading pops keys) - model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config)) + model = SimpleModel(operations=ops.mixed_precision_ops({})) model.load_state_dict(state_dict, strict=False) # Verify weights are wrapped in QuantizedTensor @@ -115,7 +118,8 @@ class TestMixedPrecisionOps(unittest.TestCase): # Forward pass input_tensor = torch.randn(5, 10, dtype=torch.bfloat16) - output = model(input_tensor) + with torch.inference_mode(): + output = model(input_tensor) self.assertEqual(output.shape, (5, 40)) @@ -141,7 +145,8 @@ class TestMixedPrecisionOps(unittest.TestCase): "layer3.bias": torch.randn(40, dtype=torch.bfloat16), } - model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config)) + state_dict1, _ = comfy.utils.convert_old_quants(state_dict1, metadata={"_quantization_metadata": json.dumps({"layers": layer_quant_config})}) + model = SimpleModel(operations=ops.mixed_precision_ops({})) model.load_state_dict(state_dict1, strict=False) # Save state dict @@ -178,7 +183,8 @@ class TestMixedPrecisionOps(unittest.TestCase): "layer3.bias": torch.randn(40, dtype=torch.bfloat16), } - model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config)) + state_dict, _ = comfy.utils.convert_old_quants(state_dict, metadata={"_quantization_metadata": json.dumps({"layers": layer_quant_config})}) + model = SimpleModel(operations=ops.mixed_precision_ops({})) model.load_state_dict(state_dict, strict=False) # Add a weight function (simulating LoRA) @@ -215,8 +221,10 @@ class TestMixedPrecisionOps(unittest.TestCase): "layer3.bias": torch.randn(40, dtype=torch.bfloat16), } + state_dict, _ = comfy.utils.convert_old_quants(state_dict, metadata={"_quantization_metadata": json.dumps({"layers": layer_quant_config})}) + # Load should raise KeyError for unknown format in QUANT_FORMAT_MIXINS - model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config)) + model = SimpleModel(operations=ops.mixed_precision_ops({})) with self.assertRaises(KeyError): model.load_state_dict(state_dict, strict=False) From 6fd463aec958f02be79a264eafd6c8fe7e52762a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 5 Dec 2025 12:33:16 -0800 Subject: [PATCH 689/759] Fix regression when text encoder loaded directly on GPU. (#11129) --- comfy/ops.py | 2 ++ comfy/sd.py | 44 ++++++++++++++++++++++++-------------------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index dc06709a1..35237c9f7 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -552,6 +552,8 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec weight_scale_key = f"{prefix}weight_scale" scale = state_dict.pop(weight_scale_key, None) + if scale is not None: + scale = scale.to(device) layout_params = { 'scale': scale, 'orig_dtype': MixedPrecisionOps._compute_dtype, diff --git a/comfy/sd.py b/comfy/sd.py index 092715d79..c350322f8 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -98,7 +98,7 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip): class CLIP: - def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, model_options={}): + def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, state_dict=[], model_options={}): if no_init: return params = target.params.copy() @@ -129,6 +129,27 @@ class CLIP: self.patcher.hook_mode = comfy.hooks.EnumHookMode.MinVram self.patcher.is_clip = True self.apply_hooks_to_conds = None + if len(state_dict) > 0: + if isinstance(state_dict, list): + for c in state_dict: + m, u = self.load_sd(c) + if len(m) > 0: + logging.warning("clip missing: {}".format(m)) + + if len(u) > 0: + logging.debug("clip unexpected: {}".format(u)) + else: + m, u = self.load_sd(state_dict, full_model=True) + if len(m) > 0: + m_filter = list(filter(lambda a: ".logit_scale" not in a and ".transformer.text_projection.weight" not in a, m)) + if len(m_filter) > 0: + logging.warning("clip missing: {}".format(m)) + else: + logging.debug("clip missing: {}".format(m)) + + if len(u) > 0: + logging.debug("clip unexpected {}:".format(u)) + if params['device'] == load_device: model_management.load_models_gpu([self.patcher], force_full_load=True) self.layer_idx = None @@ -1225,14 +1246,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip parameters += comfy.utils.calculate_parameters(c) tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options) - clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options) - for c in clip_data: - m, u = clip.load_sd(c) - if len(m) > 0: - logging.warning("clip missing: {}".format(m)) - - if len(u) > 0: - logging.debug("clip unexpected: {}".format(u)) + clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, state_dict=clip_data, model_options=model_options) return clip def load_gligen(ckpt_path): @@ -1362,17 +1376,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c clip_sd = model_config.process_clip_state_dict(sd) if len(clip_sd) > 0: parameters = comfy.utils.calculate_parameters(clip_sd) - clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd, parameters=parameters, model_options=te_model_options) - m, u = clip.load_sd(clip_sd, full_model=True) - if len(m) > 0: - m_filter = list(filter(lambda a: ".logit_scale" not in a and ".transformer.text_projection.weight" not in a, m)) - if len(m_filter) > 0: - logging.warning("clip missing: {}".format(m)) - else: - logging.debug("clip missing: {}".format(m)) - - if len(u) > 0: - logging.debug("clip unexpected {}:".format(u)) + clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd, parameters=parameters, state_dict=clip_sd, model_options=te_model_options) else: logging.warning("no CLIP/text encoder weights in checkpoint, the text encoder model will not be loaded.") From 79d17ba2339aaf4f3422673b3dad24ba4dbd7552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Fri, 5 Dec 2025 22:42:46 +0200 Subject: [PATCH 690/759] Context windows fixes and features (#10975) * Apply cond slice fix * Add FreeNoise * Update context_windows.py * Add option to retain condition by indexes for each window This allows for example Wan/HunyuanVideo image to video to "work" by using the initial start frame for each window, otherwise windows beyond first will be pure T2V generations. * Update context_windows.py * Allow splitting multiple conds into different windows * Add handling for audio_embed * whitespace * Allow freenoise to work on other dims, handle 4D batch timestep Refactor Freenoise function. And fix batch handling as timesteps seem to be expanded to batch size now. * Disable experimental options for now So that the Freenoise and bugfixes can be merged first --------- Co-authored-by: Jedrzej Kosinski Co-authored-by: ozbayb <17261091+ozbayb@users.noreply.github.com> --- comfy/context_windows.py | 104 ++++++++++++++++++++++---- comfy_extras/nodes_context_windows.py | 22 +++++- 2 files changed, 108 insertions(+), 18 deletions(-) diff --git a/comfy/context_windows.py b/comfy/context_windows.py index 041f380f9..5c412d1c2 100644 --- a/comfy/context_windows.py +++ b/comfy/context_windows.py @@ -51,26 +51,36 @@ class ContextHandlerABC(ABC): class IndexListContextWindow(ContextWindowABC): - def __init__(self, index_list: list[int], dim: int=0): + def __init__(self, index_list: list[int], dim: int=0, total_frames: int=0): self.index_list = index_list self.context_length = len(index_list) self.dim = dim + self.total_frames = total_frames + self.center_ratio = (min(index_list) + max(index_list)) / (2 * total_frames) - def get_tensor(self, full: torch.Tensor, device=None, dim=None) -> torch.Tensor: + def get_tensor(self, full: torch.Tensor, device=None, dim=None, retain_index_list=[]) -> torch.Tensor: if dim is None: dim = self.dim if dim == 0 and full.shape[dim] == 1: return full - idx = [slice(None)] * dim + [self.index_list] - return full[idx].to(device) + idx = tuple([slice(None)] * dim + [self.index_list]) + window = full[idx] + if retain_index_list: + idx = tuple([slice(None)] * dim + [retain_index_list]) + window[idx] = full[idx] + return window.to(device) def add_window(self, full: torch.Tensor, to_add: torch.Tensor, dim=None) -> torch.Tensor: if dim is None: dim = self.dim - idx = [slice(None)] * dim + [self.index_list] + idx = tuple([slice(None)] * dim + [self.index_list]) full[idx] += to_add return full + def get_region_index(self, num_regions: int) -> int: + region_idx = int(self.center_ratio * num_regions) + return min(max(region_idx, 0), num_regions - 1) + class IndexListCallbacks: EVALUATE_CONTEXT_WINDOWS = "evaluate_context_windows" @@ -94,7 +104,8 @@ class ContextFuseMethod: ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_conds_out', 'sub_conds', 'window']) class IndexListContextHandler(ContextHandlerABC): - def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1, closed_loop=False, dim=0): + def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1, + closed_loop: bool=False, dim:int=0, freenoise: bool=False, cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False): self.context_schedule = context_schedule self.fuse_method = fuse_method self.context_length = context_length @@ -103,13 +114,18 @@ class IndexListContextHandler(ContextHandlerABC): self.closed_loop = closed_loop self.dim = dim self._step = 0 + self.freenoise = freenoise + self.cond_retain_index_list = [int(x.strip()) for x in cond_retain_index_list.split(",")] if cond_retain_index_list else [] + self.split_conds_to_windows = split_conds_to_windows self.callbacks = {} def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool: # for now, assume first dim is batch - should have stored on BaseModel in actual implementation if x_in.size(self.dim) > self.context_length: - logging.info(f"Using context windows {self.context_length} for {x_in.size(self.dim)} frames.") + logging.info(f"Using context windows {self.context_length} with overlap {self.context_overlap} for {x_in.size(self.dim)} frames.") + if self.cond_retain_index_list: + logging.info(f"Retaining original cond for indexes: {self.cond_retain_index_list}") return True return False @@ -123,6 +139,11 @@ class IndexListContextHandler(ContextHandlerABC): return None # reuse or resize cond items to match context requirements resized_cond = [] + # if multiple conds, split based on primary region + if self.split_conds_to_windows and len(cond_in) > 1: + region = window.get_region_index(len(cond_in)) + logging.info(f"Splitting conds to windows; using region {region} for window {window[0]}-{window[-1]} with center ratio {window.center_ratio:.3f}") + cond_in = [cond_in[region]] # cond object is a list containing a dict - outer list is irrelevant, so just loop through it for actual_cond in cond_in: resized_actual_cond = actual_cond.copy() @@ -146,12 +167,19 @@ class IndexListContextHandler(ContextHandlerABC): # when in dictionary, look for tensors and CONDCrossAttn [comfy/conds.py] (has cond attr that is a tensor) for cond_key, cond_value in new_cond_item.items(): if isinstance(cond_value, torch.Tensor): - if cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim): + if (self.dim < cond_value.ndim and cond_value(self.dim) == x_in.size(self.dim)) or \ + (cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim)): new_cond_item[cond_key] = window.get_tensor(cond_value, device) + # Handle audio_embed (temporal dim is 1) + elif cond_key == "audio_embed" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor): + audio_cond = cond_value.cond + if audio_cond.ndim > 1 and audio_cond.size(1) == x_in.size(self.dim): + new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(audio_cond, device, dim=1)) # if has cond that is a Tensor, check if needs to be subset elif hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor): - if cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim): - new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device)) + if (self.dim < cond_value.cond.ndim and cond_value.cond.size(self.dim) == x_in.size(self.dim)) or \ + (cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim)): + new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device, retain_index_list=self.cond_retain_index_list)) elif cond_key == "num_video_frames": # for SVD new_cond_item[cond_key] = cond_value._copy_with(cond_value.cond) new_cond_item[cond_key].cond = window.context_length @@ -164,7 +192,7 @@ class IndexListContextHandler(ContextHandlerABC): return resized_cond def set_step(self, timestep: torch.Tensor, model_options: dict[str]): - mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep, rtol=0.0001) + mask = torch.isclose(model_options["transformer_options"]["sample_sigmas"], timestep[0], rtol=0.0001) matches = torch.nonzero(mask) if torch.numel(matches) == 0: raise Exception("No sample_sigmas matched current timestep; something went wrong.") @@ -173,7 +201,7 @@ class IndexListContextHandler(ContextHandlerABC): def get_context_windows(self, model: BaseModel, x_in: torch.Tensor, model_options: dict[str]) -> list[IndexListContextWindow]: full_length = x_in.size(self.dim) # TODO: choose dim based on model context_windows = self.context_schedule.func(full_length, self, model_options) - context_windows = [IndexListContextWindow(window, dim=self.dim) for window in context_windows] + context_windows = [IndexListContextWindow(window, dim=self.dim, total_frames=full_length) for window in context_windows] return context_windows def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]): @@ -250,8 +278,8 @@ class IndexListContextHandler(ContextHandlerABC): prev_weight = (bias_total / (bias_total + bias)) new_weight = (bias / (bias_total + bias)) # account for dims of tensors - idx_window = [slice(None)] * self.dim + [idx] - pos_window = [slice(None)] * self.dim + [pos] + idx_window = tuple([slice(None)] * self.dim + [idx]) + pos_window = tuple([slice(None)] * self.dim + [pos]) # apply new values conds_final[i][idx_window] = conds_final[i][idx_window] * prev_weight + sub_conds_out[i][pos_window] * new_weight biases_final[i][idx] = bias_total + bias @@ -287,6 +315,28 @@ def create_prepare_sampling_wrapper(model: ModelPatcher): ) +def _sampler_sample_wrapper(executor, guider, sigmas, extra_args, callback, noise, *args, **kwargs): + model_options = extra_args.get("model_options", None) + if model_options is None: + raise Exception("model_options not found in sampler_sample_wrapper; this should never happen, something went wrong.") + handler: IndexListContextHandler = model_options.get("context_handler", None) + if handler is None: + raise Exception("context_handler not found in sampler_sample_wrapper; this should never happen, something went wrong.") + if not handler.freenoise: + return executor(guider, sigmas, extra_args, callback, noise, *args, **kwargs) + noise = apply_freenoise(noise, handler.dim, handler.context_length, handler.context_overlap, extra_args["seed"]) + + return executor(guider, sigmas, extra_args, callback, noise, *args, **kwargs) + + +def create_sampler_sample_wrapper(model: ModelPatcher): + model.add_wrapper_with_key( + comfy.patcher_extension.WrappersMP.SAMPLER_SAMPLE, + "ContextWindows_sampler_sample", + _sampler_sample_wrapper + ) + + def match_weights_to_dim(weights: list[float], x_in: torch.Tensor, dim: int, device=None) -> torch.Tensor: total_dims = len(x_in.shape) weights_tensor = torch.Tensor(weights).to(device=device) @@ -538,3 +588,29 @@ def shift_window_to_end(window: list[int], num_frames: int): for i in range(len(window)): # 2) add end_delta to each val to slide windows to end window[i] = window[i] + end_delta + + +# https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved/blob/90fb1331201a4b29488089e4fbffc0d82cc6d0a9/animatediff/sample_settings.py#L465 +def apply_freenoise(noise: torch.Tensor, dim: int, context_length: int, context_overlap: int, seed: int): + logging.info("Context windows: Applying FreeNoise") + generator = torch.Generator(device='cpu').manual_seed(seed) + latent_video_length = noise.shape[dim] + delta = context_length - context_overlap + + for start_idx in range(0, latent_video_length - context_length, delta): + place_idx = start_idx + context_length + + actual_delta = min(delta, latent_video_length - place_idx) + if actual_delta <= 0: + break + + list_idx = torch.randperm(actual_delta, generator=generator, device='cpu') + start_idx + + source_slice = [slice(None)] * noise.ndim + source_slice[dim] = list_idx + target_slice = [slice(None)] * noise.ndim + target_slice[dim] = slice(place_idx, place_idx + actual_delta) + + noise[tuple(target_slice)] = noise[tuple(source_slice)] + + return noise diff --git a/comfy_extras/nodes_context_windows.py b/comfy_extras/nodes_context_windows.py index 1c3d9e697..3799a9004 100644 --- a/comfy_extras/nodes_context_windows.py +++ b/comfy_extras/nodes_context_windows.py @@ -26,6 +26,9 @@ class ContextWindowsManualNode(io.ComfyNode): io.Boolean.Input("closed_loop", default=False, tooltip="Whether to close the context window loop; only applicable to looped schedules."), io.Combo.Input("fuse_method", options=comfy.context_windows.ContextFuseMethods.LIST_STATIC, default=comfy.context_windows.ContextFuseMethods.PYRAMID, tooltip="The method to use to fuse the context windows."), io.Int.Input("dim", min=0, max=5, default=0, tooltip="The dimension to apply the context windows to."), + io.Boolean.Input("freenoise", default=False, tooltip="Whether to apply FreeNoise noise shuffling, improves window blending."), + #io.String.Input("cond_retain_index_list", default="", tooltip="List of latent indices to retain in the conditioning tensors for each window, for example setting this to '0' will use the initial start image for each window."), + #io.Boolean.Input("split_conds_to_windows", default=False, tooltip="Whether to split multiple conditionings (created by ConditionCombine) to each window based on region index."), ], outputs=[ io.Model.Output(tooltip="The model with context windows applied during sampling."), @@ -34,7 +37,8 @@ class ContextWindowsManualNode(io.ComfyNode): ) @classmethod - def execute(cls, model: io.Model.Type, context_length: int, context_overlap: int, context_schedule: str, context_stride: int, closed_loop: bool, fuse_method: str, dim: int) -> io.Model: + def execute(cls, model: io.Model.Type, context_length: int, context_overlap: int, context_schedule: str, context_stride: int, closed_loop: bool, fuse_method: str, dim: int, freenoise: bool, + cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False) -> io.Model: model = model.clone() model.model_options["context_handler"] = comfy.context_windows.IndexListContextHandler( context_schedule=comfy.context_windows.get_matching_context_schedule(context_schedule), @@ -43,9 +47,15 @@ class ContextWindowsManualNode(io.ComfyNode): context_overlap=context_overlap, context_stride=context_stride, closed_loop=closed_loop, - dim=dim) + dim=dim, + freenoise=freenoise, + cond_retain_index_list=cond_retain_index_list, + split_conds_to_windows=split_conds_to_windows + ) # make memory usage calculation only take into account the context window latents comfy.context_windows.create_prepare_sampling_wrapper(model) + if freenoise: # no other use for this wrapper at this time + comfy.context_windows.create_sampler_sample_wrapper(model) return io.NodeOutput(model) class WanContextWindowsManualNode(ContextWindowsManualNode): @@ -68,14 +78,18 @@ class WanContextWindowsManualNode(ContextWindowsManualNode): io.Int.Input("context_stride", min=1, default=1, tooltip="The stride of the context window; only applicable to uniform schedules."), io.Boolean.Input("closed_loop", default=False, tooltip="Whether to close the context window loop; only applicable to looped schedules."), io.Combo.Input("fuse_method", options=comfy.context_windows.ContextFuseMethods.LIST_STATIC, default=comfy.context_windows.ContextFuseMethods.PYRAMID, tooltip="The method to use to fuse the context windows."), + io.Boolean.Input("freenoise", default=False, tooltip="Whether to apply FreeNoise noise shuffling, improves window blending."), + #io.String.Input("cond_retain_index_list", default="", tooltip="List of latent indices to retain in the conditioning tensors for each window, for example setting this to '0' will use the initial start image for each window."), + #io.Boolean.Input("split_conds_to_windows", default=False, tooltip="Whether to split multiple conditionings (created by ConditionCombine) to each window based on region index."), ] return schema @classmethod - def execute(cls, model: io.Model.Type, context_length: int, context_overlap: int, context_schedule: str, context_stride: int, closed_loop: bool, fuse_method: str) -> io.Model: + def execute(cls, model: io.Model.Type, context_length: int, context_overlap: int, context_schedule: str, context_stride: int, closed_loop: bool, fuse_method: str, freenoise: bool, + cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False) -> io.Model: context_length = max(((context_length - 1) // 4) + 1, 1) # at least length 1 context_overlap = max(((context_overlap - 1) // 4) + 1, 0) # at least overlap 0 - return super().execute(model, context_length, context_overlap, context_schedule, context_stride, closed_loop, fuse_method, dim=2) + return super().execute(model, context_length, context_overlap, context_schedule, context_stride, closed_loop, fuse_method, dim=2, freenoise=freenoise, cond_retain_index_list=cond_retain_index_list, split_conds_to_windows=split_conds_to_windows) class ContextWindowsExtension(ComfyExtension): From 092ee8a5008c8d558b0a72cc7961a31d9cc5400b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 5 Dec 2025 15:25:31 -0800 Subject: [PATCH 691/759] Fix some custom nodes. (#11134) --- comfy/supported_models_base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/comfy/supported_models_base.py b/comfy/supported_models_base.py index 9fd84d329..0e7a829ba 100644 --- a/comfy/supported_models_base.py +++ b/comfy/supported_models_base.py @@ -17,6 +17,7 @@ """ import torch +import logging from . import model_base from . import utils from . import latent_formats @@ -117,3 +118,7 @@ class BASE: def set_inference_dtype(self, dtype, manual_cast_dtype): self.unet_config['dtype'] = dtype self.manual_cast_dtype = manual_cast_dtype + + def __getattr__(self, name): + logging.warning("\nWARNING, you accessed {} from the model config object which doesn't exist. Please fix your code.\n".format(name)) + return None From bed12674a1d2c4bfdfbdd098686390f807996c90 Mon Sep 17 00:00:00 2001 From: "Dr.Lt.Data" <128333288+ltdrdata@users.noreply.github.com> Date: Sat, 6 Dec 2025 08:45:38 +0900 Subject: [PATCH 692/759] docs: add ComfyUI-Manager documentation and update to v4.0.3b4 (#11133) - Add manager setup instructions and command line options to README - Document --enable-manager, --enable-manager-legacy-ui, and --disable-manager-ui flags - Bump comfyui_manager version from 4.0.3b3 to 4.0.3b4 --- README.md | 26 ++++++++++++++++++++++++++ manager_requirements.txt | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed857df9f..bae955b1b 100644 --- a/README.md +++ b/README.md @@ -320,6 +320,32 @@ For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step 1. Install the Iluvatar Corex Toolkit by adhering to the platform-specific instructions on the [Installation](https://support.iluvatar.com/#/DocumentCentre?id=1&nameCenter=2&productId=520117912052801536) 2. Launch ComfyUI by running `python main.py` + +## [ComfyUI-Manager](https://github.com/Comfy-Org/ComfyUI-Manager/tree/manager-v4) + +**ComfyUI-Manager** is an extension that allows you to easily install, update, and manage custom nodes for ComfyUI. + +### Setup + +1. Install the manager dependencies: + ```bash + pip install -r manager_requirements.txt + ``` + +2. Enable the manager with the `--enable-manager` flag when running ComfyUI: + ```bash + python main.py --enable-manager + ``` + +### Command Line Options + +| Flag | Description | +|------|-------------| +| `--enable-manager` | Enable ComfyUI-Manager | +| `--enable-manager-legacy-ui` | Use the legacy manager UI instead of the new UI (requires `--enable-manager`) | +| `--disable-manager-ui` | Disable the manager UI and endpoints while keeping background features like security checks and scheduled installation completion (requires `--enable-manager`) | + + # Running ```python main.py``` diff --git a/manager_requirements.txt b/manager_requirements.txt index 52cc5389c..b95cefb74 100644 --- a/manager_requirements.txt +++ b/manager_requirements.txt @@ -1 +1 @@ -comfyui_manager==4.0.3b3 +comfyui_manager==4.0.3b4 From fd109325db7126f92c2dfb7e6b25310eded8c1f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Sat, 6 Dec 2025 05:20:22 +0200 Subject: [PATCH 693/759] Kandinsky5 model support (#10988) * Add Kandinsky5 model support lite and pro T2V tested to work * Update kandinsky5.py * Fix fp8 * Fix fp8_scaled text encoder * Add transformer_options for attention * Code cleanup, optimizations, use fp32 for all layers originally at fp32 * ImageToVideo -node * Fix I2V, add necessary latent post process nodes * Support text to image model * Support block replace patches (SLG mostly) * Support official LoRAs * Don't scale RoPE for lite model as that just doesn't work... * Update supported_models.py * Rever RoPE scaling to simpler one * Fix typo * Handle latent dim difference for image model in the VAE instead * Add node to use different prompts for clip_l and qwen25_7b * Reduce peak VRAM usage a bit * Further reduce peak VRAM consumption by chunking ffn * Update chunking * Update memory_usage_factor * Code cleanup, don't force the fp32 layers as it has minimal effect * Allow for stronger changes with first frames normalization Default values are too weak for any meaningful changes, these should probably be exposed as advanced node options when that's available. * Add image model's own chat template, remove unused image2video template * Remove hard error in ReplaceVideoLatentFrames -node * Update kandinsky5.py * Update supported_models.py * Fix typos in prompt template They were now fixed in the original repository as well * Update ReplaceVideoLatentFrames Add tooltips Make source optional Better handle negative index * Rename NormalizeVideoLatentFrames -node For bit better clarity what it does * Fix NormalizeVideoLatentStart node out on non-op --- comfy/ldm/kandinsky5/model.py | 407 ++++++++++++++++++++++++++++++ comfy/lora.py | 7 + comfy/model_base.py | 47 ++++ comfy/model_detection.py | 18 ++ comfy/sd.py | 11 + comfy/supported_models.py | 56 +++- comfy/text_encoders/kandinsky5.py | 68 +++++ comfy_api/latest/_io.py | 2 + comfy_extras/nodes_kandinsky5.py | 136 ++++++++++ comfy_extras/nodes_latent.py | 39 ++- nodes.py | 3 +- 11 files changed, 791 insertions(+), 3 deletions(-) create mode 100644 comfy/ldm/kandinsky5/model.py create mode 100644 comfy/text_encoders/kandinsky5.py create mode 100644 comfy_extras/nodes_kandinsky5.py diff --git a/comfy/ldm/kandinsky5/model.py b/comfy/ldm/kandinsky5/model.py new file mode 100644 index 000000000..a653e02fc --- /dev/null +++ b/comfy/ldm/kandinsky5/model.py @@ -0,0 +1,407 @@ +import torch +from torch import nn +import math + +import comfy.ldm.common_dit +from comfy.ldm.modules.attention import optimized_attention +from comfy.ldm.flux.math import apply_rope1 +from comfy.ldm.flux.layers import EmbedND + +def attention(q, k, v, heads, transformer_options={}): + return optimized_attention( + q.transpose(1, 2), + k.transpose(1, 2), + v.transpose(1, 2), + heads=heads, + skip_reshape=True, + transformer_options=transformer_options + ) + +def apply_scale_shift_norm(norm, x, scale, shift): + return torch.addcmul(shift, norm(x), scale + 1.0) + +def apply_gate_sum(x, out, gate): + return torch.addcmul(x, gate, out) + +def get_shift_scale_gate(params): + shift, scale, gate = torch.chunk(params, 3, dim=-1) + return tuple(x.unsqueeze(1) for x in (shift, scale, gate)) + +def get_freqs(dim, max_period=10000.0): + return torch.exp(-math.log(max_period) * torch.arange(start=0, end=dim, dtype=torch.float32) / dim) + + +class TimeEmbeddings(nn.Module): + def __init__(self, model_dim, time_dim, max_period=10000.0, operation_settings=None): + super().__init__() + assert model_dim % 2 == 0 + self.model_dim = model_dim + self.max_period = max_period + self.register_buffer("freqs", get_freqs(model_dim // 2, max_period), persistent=False) + operations = operation_settings.get("operations") + self.in_layer = operations.Linear(model_dim, time_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.activation = nn.SiLU() + self.out_layer = operations.Linear(time_dim, time_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + + def forward(self, timestep, dtype): + args = torch.outer(timestep, self.freqs.to(device=timestep.device)) + time_embed = torch.cat([torch.cos(args), torch.sin(args)], dim=-1).to(dtype) + time_embed = self.out_layer(self.activation(self.in_layer(time_embed))) + return time_embed + + +class TextEmbeddings(nn.Module): + def __init__(self, text_dim, model_dim, operation_settings=None): + super().__init__() + operations = operation_settings.get("operations") + self.in_layer = operations.Linear(text_dim, model_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.norm = operations.LayerNorm(model_dim, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + + def forward(self, text_embed): + text_embed = self.in_layer(text_embed) + return self.norm(text_embed).type_as(text_embed) + + +class VisualEmbeddings(nn.Module): + def __init__(self, visual_dim, model_dim, patch_size, operation_settings=None): + super().__init__() + self.patch_size = patch_size + operations = operation_settings.get("operations") + self.in_layer = operations.Linear(visual_dim, model_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + + def forward(self, x): + x = x.movedim(1, -1) # B C T H W -> B T H W C + B, T, H, W, dim = x.shape + pt, ph, pw = self.patch_size + + x = x.view( + B, + T // pt, pt, + H // ph, ph, + W // pw, pw, + dim, + ).permute(0, 1, 3, 5, 2, 4, 6, 7).flatten(4, 7) + + return self.in_layer(x) + + +class Modulation(nn.Module): + def __init__(self, time_dim, model_dim, num_params, operation_settings=None): + super().__init__() + self.activation = nn.SiLU() + self.out_layer = operation_settings.get("operations").Linear(time_dim, num_params * model_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + + def forward(self, x): + return self.out_layer(self.activation(x)) + + +class SelfAttention(nn.Module): + def __init__(self, num_channels, head_dim, operation_settings=None): + super().__init__() + assert num_channels % head_dim == 0 + self.num_heads = num_channels // head_dim + self.head_dim = head_dim + + operations = operation_settings.get("operations") + self.to_query = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.to_key = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.to_value = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.query_norm = operations.RMSNorm(head_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.key_norm = operations.RMSNorm(head_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + + self.out_layer = operations.Linear(num_channels, num_channels, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.num_chunks = 2 + + def _compute_qk(self, x, freqs, proj_fn, norm_fn): + result = proj_fn(x).view(*x.shape[:-1], self.num_heads, -1) + return apply_rope1(norm_fn(result), freqs) + + def _forward(self, x, freqs, transformer_options={}): + q = self._compute_qk(x, freqs, self.to_query, self.query_norm) + k = self._compute_qk(x, freqs, self.to_key, self.key_norm) + v = self.to_value(x).view(*x.shape[:-1], self.num_heads, -1) + out = attention(q, k, v, self.num_heads, transformer_options=transformer_options) + return self.out_layer(out) + + def _forward_chunked(self, x, freqs, transformer_options={}): + def process_chunks(proj_fn, norm_fn): + x_chunks = torch.chunk(x, self.num_chunks, dim=1) + freqs_chunks = torch.chunk(freqs, self.num_chunks, dim=1) + chunks = [] + for x_chunk, freqs_chunk in zip(x_chunks, freqs_chunks): + chunks.append(self._compute_qk(x_chunk, freqs_chunk, proj_fn, norm_fn)) + return torch.cat(chunks, dim=1) + + q = process_chunks(self.to_query, self.query_norm) + k = process_chunks(self.to_key, self.key_norm) + v = self.to_value(x).view(*x.shape[:-1], self.num_heads, -1) + out = attention(q, k, v, self.num_heads, transformer_options=transformer_options) + return self.out_layer(out) + + def forward(self, x, freqs, transformer_options={}): + if x.shape[1] > 8192: + return self._forward_chunked(x, freqs, transformer_options=transformer_options) + else: + return self._forward(x, freqs, transformer_options=transformer_options) + + +class CrossAttention(SelfAttention): + def get_qkv(self, x, context): + q = self.to_query(x).view(*x.shape[:-1], self.num_heads, -1) + k = self.to_key(context).view(*context.shape[:-1], self.num_heads, -1) + v = self.to_value(context).view(*context.shape[:-1], self.num_heads, -1) + return q, k, v + + def forward(self, x, context, transformer_options={}): + q, k, v = self.get_qkv(x, context) + out = attention(self.query_norm(q), self.key_norm(k), v, self.num_heads, transformer_options=transformer_options) + return self.out_layer(out) + + +class FeedForward(nn.Module): + def __init__(self, dim, ff_dim, operation_settings=None): + super().__init__() + operations = operation_settings.get("operations") + self.in_layer = operations.Linear(dim, ff_dim, bias=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.activation = nn.GELU() + self.out_layer = operations.Linear(ff_dim, dim, bias=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.num_chunks = 4 + + def _forward(self, x): + return self.out_layer(self.activation(self.in_layer(x))) + + def _forward_chunked(self, x): + chunks = torch.chunk(x, self.num_chunks, dim=1) + output_chunks = [] + for chunk in chunks: + output_chunks.append(self._forward(chunk)) + return torch.cat(output_chunks, dim=1) + + def forward(self, x): + if x.shape[1] > 8192: + return self._forward_chunked(x) + else: + return self._forward(x) + + +class OutLayer(nn.Module): + def __init__(self, model_dim, time_dim, visual_dim, patch_size, operation_settings=None): + super().__init__() + self.patch_size = patch_size + self.modulation = Modulation(time_dim, model_dim, 2, operation_settings=operation_settings) + operations = operation_settings.get("operations") + self.norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.out_layer = operations.Linear(model_dim, math.prod(patch_size) * visual_dim, bias=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + + def forward(self, visual_embed, time_embed): + B, T, H, W, _ = visual_embed.shape + shift, scale = torch.chunk(self.modulation(time_embed), 2, dim=-1) + scale = scale[:, None, None, None, :] + shift = shift[:, None, None, None, :] + visual_embed = apply_scale_shift_norm(self.norm, visual_embed, scale, shift) + x = self.out_layer(visual_embed) + + out_dim = x.shape[-1] // (self.patch_size[0] * self.patch_size[1] * self.patch_size[2]) + x = x.view( + B, T, H, W, + out_dim, + self.patch_size[0], self.patch_size[1], self.patch_size[2] + ) + return x.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(2, 3).flatten(3, 4).flatten(4, 5) + + +class TransformerEncoderBlock(nn.Module): + def __init__(self, model_dim, time_dim, ff_dim, head_dim, operation_settings=None): + super().__init__() + self.text_modulation = Modulation(time_dim, model_dim, 6, operation_settings=operation_settings) + operations = operation_settings.get("operations") + + self.self_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.self_attention = SelfAttention(model_dim, head_dim, operation_settings=operation_settings) + + self.feed_forward_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.feed_forward = FeedForward(model_dim, ff_dim, operation_settings=operation_settings) + + def forward(self, x, time_embed, freqs, transformer_options={}): + self_attn_params, ff_params = torch.chunk(self.text_modulation(time_embed), 2, dim=-1) + shift, scale, gate = get_shift_scale_gate(self_attn_params) + out = apply_scale_shift_norm(self.self_attention_norm, x, scale, shift) + out = self.self_attention(out, freqs, transformer_options=transformer_options) + x = apply_gate_sum(x, out, gate) + + shift, scale, gate = get_shift_scale_gate(ff_params) + out = apply_scale_shift_norm(self.feed_forward_norm, x, scale, shift) + out = self.feed_forward(out) + x = apply_gate_sum(x, out, gate) + return x + + +class TransformerDecoderBlock(nn.Module): + def __init__(self, model_dim, time_dim, ff_dim, head_dim, operation_settings=None): + super().__init__() + self.visual_modulation = Modulation(time_dim, model_dim, 9, operation_settings=operation_settings) + + operations = operation_settings.get("operations") + self.self_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.self_attention = SelfAttention(model_dim, head_dim, operation_settings=operation_settings) + + self.cross_attention_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.cross_attention = CrossAttention(model_dim, head_dim, operation_settings=operation_settings) + + self.feed_forward_norm = operations.LayerNorm(model_dim, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) + self.feed_forward = FeedForward(model_dim, ff_dim, operation_settings=operation_settings) + + def forward(self, visual_embed, text_embed, time_embed, freqs, transformer_options={}): + self_attn_params, cross_attn_params, ff_params = torch.chunk(self.visual_modulation(time_embed), 3, dim=-1) + # self attention + shift, scale, gate = get_shift_scale_gate(self_attn_params) + visual_out = apply_scale_shift_norm(self.self_attention_norm, visual_embed, scale, shift) + visual_out = self.self_attention(visual_out, freqs, transformer_options=transformer_options) + visual_embed = apply_gate_sum(visual_embed, visual_out, gate) + # cross attention + shift, scale, gate = get_shift_scale_gate(cross_attn_params) + visual_out = apply_scale_shift_norm(self.cross_attention_norm, visual_embed, scale, shift) + visual_out = self.cross_attention(visual_out, text_embed, transformer_options=transformer_options) + visual_embed = apply_gate_sum(visual_embed, visual_out, gate) + # feed forward + shift, scale, gate = get_shift_scale_gate(ff_params) + visual_out = apply_scale_shift_norm(self.feed_forward_norm, visual_embed, scale, shift) + visual_out = self.feed_forward(visual_out) + visual_embed = apply_gate_sum(visual_embed, visual_out, gate) + return visual_embed + + +class Kandinsky5(nn.Module): + def __init__( + self, + in_visual_dim=16, out_visual_dim=16, in_text_dim=3584, in_text_dim2=768, time_dim=512, + model_dim=1792, ff_dim=7168, visual_embed_dim=132, patch_size=(1, 2, 2), num_text_blocks=2, num_visual_blocks=32, + axes_dims=(16, 24, 24), rope_scale_factor=(1.0, 2.0, 2.0), + dtype=None, device=None, operations=None, **kwargs + ): + super().__init__() + head_dim = sum(axes_dims) + self.rope_scale_factor = rope_scale_factor + self.in_visual_dim = in_visual_dim + self.model_dim = model_dim + self.patch_size = patch_size + self.visual_embed_dim = visual_embed_dim + self.dtype = dtype + self.device = device + operation_settings = {"operations": operations, "device": device, "dtype": dtype} + + self.time_embeddings = TimeEmbeddings(model_dim, time_dim, operation_settings=operation_settings) + self.text_embeddings = TextEmbeddings(in_text_dim, model_dim, operation_settings=operation_settings) + self.pooled_text_embeddings = TextEmbeddings(in_text_dim2, time_dim, operation_settings=operation_settings) + self.visual_embeddings = VisualEmbeddings(visual_embed_dim, model_dim, patch_size, operation_settings=operation_settings) + + self.text_transformer_blocks = nn.ModuleList( + [TransformerEncoderBlock(model_dim, time_dim, ff_dim, head_dim, operation_settings=operation_settings) for _ in range(num_text_blocks)] + ) + + self.visual_transformer_blocks = nn.ModuleList( + [TransformerDecoderBlock(model_dim, time_dim, ff_dim, head_dim, operation_settings=operation_settings) for _ in range(num_visual_blocks)] + ) + + self.out_layer = OutLayer(model_dim, time_dim, out_visual_dim, patch_size, operation_settings=operation_settings) + + self.rope_embedder_3d = EmbedND(dim=head_dim, theta=10000.0, axes_dim=axes_dims) + self.rope_embedder_1d = EmbedND(dim=head_dim, theta=10000.0, axes_dim=[head_dim]) + + def rope_encode_1d(self, seq_len, seq_start=0, steps=None, device=None, dtype=None, transformer_options={}): + steps = seq_len if steps is None else steps + seq_ids = torch.linspace(seq_start, seq_start + (seq_len - 1), steps=steps, device=device, dtype=dtype) + seq_ids = seq_ids.reshape(-1, 1).unsqueeze(0) # Shape: (1, steps, 1) + freqs = self.rope_embedder_1d(seq_ids).movedim(1, 2) + return freqs + + def rope_encode_3d(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, transformer_options={}): + + patch_size = self.patch_size + t_len = ((t + (patch_size[0] // 2)) // patch_size[0]) + h_len = ((h + (patch_size[1] // 2)) // patch_size[1]) + w_len = ((w + (patch_size[2] // 2)) // patch_size[2]) + + if steps_t is None: + steps_t = t_len + if steps_h is None: + steps_h = h_len + if steps_w is None: + steps_w = w_len + + h_start = 0 + w_start = 0 + rope_options = transformer_options.get("rope_options", None) + if rope_options is not None: + t_len = (t_len - 1.0) * rope_options.get("scale_t", 1.0) + 1.0 + h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0 + w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0 + + t_start += rope_options.get("shift_t", 0.0) + h_start += rope_options.get("shift_y", 0.0) + w_start += rope_options.get("shift_x", 0.0) + else: + rope_scale_factor = self.rope_scale_factor + if self.model_dim == 4096: # pro video model uses different rope scaling at higher resolutions + if h * w >= 14080: + rope_scale_factor = (1.0, 3.16, 3.16) + + t_len = (t_len - 1.0) / rope_scale_factor[0] + 1.0 + h_len = (h_len - 1.0) / rope_scale_factor[1] + 1.0 + w_len = (w_len - 1.0) / rope_scale_factor[2] + 1.0 + + img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype) + img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1) + img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(h_start, h_start + (h_len - 1), steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1) + img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(w_start, w_start + (w_len - 1), steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1) + img_ids = img_ids.reshape(1, -1, img_ids.shape[-1]) + + freqs = self.rope_embedder_3d(img_ids).movedim(1, 2) + return freqs + + def forward_orig(self, x, timestep, context, y, freqs, freqs_text, transformer_options={}, **kwargs): + patches_replace = transformer_options.get("patches_replace", {}) + context = self.text_embeddings(context) + time_embed = self.time_embeddings(timestep, x.dtype) + self.pooled_text_embeddings(y) + + for block in self.text_transformer_blocks: + context = block(context, time_embed, freqs_text, transformer_options=transformer_options) + + visual_embed = self.visual_embeddings(x) + visual_shape = visual_embed.shape[:-1] + visual_embed = visual_embed.flatten(1, -2) + + blocks_replace = patches_replace.get("dit", {}) + transformer_options["total_blocks"] = len(self.visual_transformer_blocks) + transformer_options["block_type"] = "double" + for i, block in enumerate(self.visual_transformer_blocks): + transformer_options["block_index"] = i + if ("double_block", i) in blocks_replace: + def block_wrap(args): + return block(x=args["x"], context=args["context"], time_embed=args["time_embed"], freqs=args["freqs"], transformer_options=args.get("transformer_options")) + visual_embed = blocks_replace[("double_block", i)]({"x": visual_embed, "context": context, "time_embed": time_embed, "freqs": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})["x"] + else: + visual_embed = block(visual_embed, context, time_embed, freqs=freqs, transformer_options=transformer_options) + + visual_embed = visual_embed.reshape(*visual_shape, -1) + return self.out_layer(visual_embed, time_embed) + + def _forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options={}, **kwargs): + bs, c, t_len, h, w = x.shape + x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size) + + if time_dim_replace is not None: + time_dim_replace = comfy.ldm.common_dit.pad_to_patch_size(time_dim_replace, self.patch_size) + x[:, :time_dim_replace.shape[1], :time_dim_replace.shape[2]] = time_dim_replace + + freqs = self.rope_encode_3d(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options) + freqs_text = self.rope_encode_1d(context.shape[1], device=x.device, dtype=x.dtype, transformer_options=transformer_options) + + return self.forward_orig(x, timestep, context, y, freqs, freqs_text, transformer_options=transformer_options, **kwargs) + + def forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, + self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options) + ).execute(x, timestep, context, y, time_dim_replace=time_dim_replace, transformer_options=transformer_options, **kwargs) diff --git a/comfy/lora.py b/comfy/lora.py index 3a9077869..e7202ce97 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -322,6 +322,13 @@ def model_lora_keys_unet(model, key_map={}): key_map["diffusion_model.{}".format(key_lora)] = to key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = to + if isinstance(model, comfy.model_base.Kandinsky5): + for k in sdk: + if k.startswith("diffusion_model.") and k.endswith(".weight"): + key_lora = k[len("diffusion_model."):-len(".weight")] + key_map["{}".format(key_lora)] = k + key_map["transformer.{}".format(key_lora)] = k + return key_map diff --git a/comfy/model_base.py b/comfy/model_base.py index 3cedd4f31..0be006cc2 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -47,6 +47,7 @@ import comfy.ldm.chroma_radiance.model import comfy.ldm.ace.model import comfy.ldm.omnigen.omnigen2 import comfy.ldm.qwen_image.model +import comfy.ldm.kandinsky5.model import comfy.model_management import comfy.patcher_extension @@ -1630,3 +1631,49 @@ class HunyuanVideo15_SR_Distilled(HunyuanVideo15): out = super().extra_conds(**kwargs) out['disable_time_r'] = comfy.conds.CONDConstant(False) return out + +class Kandinsky5(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.kandinsky5.model.Kandinsky5) + + def encode_adm(self, **kwargs): + return kwargs["pooled_output"] + + def concat_cond(self, **kwargs): + noise = kwargs.get("noise", None) + device = kwargs["device"] + image = torch.zeros_like(noise) + + mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None)) + if mask is None: + mask = torch.zeros_like(noise)[:, :1] + else: + mask = 1.0 - mask + mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") + if mask.shape[-3] < noise.shape[-3]: + mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0) + mask = utils.resize_to_batch_size(mask, noise.shape[0]) + + return torch.cat((image, mask), dim=1) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + out['attention_mask'] = comfy.conds.CONDRegular(attention_mask) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + + time_dim_replace = kwargs.get("time_dim_replace", None) + if time_dim_replace is not None: + out['time_dim_replace'] = comfy.conds.CONDRegular(self.process_latent_in(time_dim_replace)) + + return out + +class Kandinsky5Image(Kandinsky5): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device) + + def concat_cond(self, **kwargs): + return None diff --git a/comfy/model_detection.py b/comfy/model_detection.py index fd1907627..30b33a486 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -611,6 +611,24 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["num_layers"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.') return dit_config + if '{}visual_transformer_blocks.0.cross_attention.key_norm.weight'.format(key_prefix) in state_dict_keys: # Kandinsky 5 + dit_config = {} + model_dim = state_dict['{}visual_embeddings.in_layer.bias'.format(key_prefix)].shape[0] + dit_config["model_dim"] = model_dim + if model_dim in [4096, 2560]: # pro video and lite image + dit_config["axes_dims"] = (32, 48, 48) + if model_dim == 2560: # lite image + dit_config["rope_scale_factor"] = (1.0, 1.0, 1.0) + elif model_dim == 1792: # lite video + dit_config["axes_dims"] = (16, 24, 24) + dit_config["time_dim"] = state_dict['{}time_embeddings.in_layer.bias'.format(key_prefix)].shape[0] + dit_config["image_model"] = "kandinsky5" + dit_config["ff_dim"] = state_dict['{}visual_transformer_blocks.0.feed_forward.in_layer.weight'.format(key_prefix)].shape[0] + dit_config["visual_embed_dim"] = state_dict['{}visual_embeddings.in_layer.weight'.format(key_prefix)].shape[1] + dit_config["num_text_blocks"] = count_blocks(state_dict_keys, '{}text_transformer_blocks.'.format(key_prefix) + '{}.') + dit_config["num_visual_blocks"] = count_blocks(state_dict_keys, '{}visual_transformer_blocks.'.format(key_prefix) + '{}.') + return dit_config + if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys: return None diff --git a/comfy/sd.py b/comfy/sd.py index c350322f8..754b1703d 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -54,6 +54,7 @@ import comfy.text_encoders.qwen_image import comfy.text_encoders.hunyuan_image import comfy.text_encoders.z_image import comfy.text_encoders.ovis +import comfy.text_encoders.kandinsky5 import comfy.model_patcher import comfy.lora @@ -766,6 +767,8 @@ class VAE: self.throw_exception_if_invalid() pixel_samples = None do_tile = False + if self.latent_dim == 2 and samples_in.ndim == 5: + samples_in = samples_in[:, :, 0] try: memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype) model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) @@ -983,6 +986,8 @@ class CLIPType(Enum): HUNYUAN_IMAGE = 19 HUNYUAN_VIDEO_15 = 20 OVIS = 21 + KANDINSKY5 = 22 + KANDINSKY5_IMAGE = 23 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}): @@ -1231,6 +1236,12 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip elif clip_type == CLIPType.HUNYUAN_VIDEO_15: clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer + elif clip_type == CLIPType.KANDINSKY5: + clip_target.clip = comfy.text_encoders.kandinsky5.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.kandinsky5.Kandinsky5Tokenizer + elif clip_type == CLIPType.KANDINSKY5_IMAGE: + clip_target.clip = comfy.text_encoders.kandinsky5.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage else: clip_target.clip = sdxl_clip.SDXLClipModel clip_target.tokenizer = sdxl_clip.SDXLTokenizer diff --git a/comfy/supported_models.py b/comfy/supported_models.py index afd97160b..91cc4ef08 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -21,6 +21,7 @@ import comfy.text_encoders.ace import comfy.text_encoders.omnigen2 import comfy.text_encoders.qwen_image import comfy.text_encoders.hunyuan_image +import comfy.text_encoders.kandinsky5 import comfy.text_encoders.z_image from . import supported_models_base @@ -1474,7 +1475,60 @@ class HunyuanVideo15_SR_Distilled(HunyuanVideo): hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2] +class Kandinsky5(supported_models_base.BASE): + unet_config = { + "image_model": "kandinsky5", + } + + sampling_settings = { + "shift": 10.0, + } + + unet_extra_config = {} + latent_format = latent_formats.HunyuanVideo + + memory_usage_factor = 1.1 #TODO + + supported_inference_dtypes = [torch.bfloat16, torch.float32] + + vae_key_prefix = ["vae."] + text_encoder_key_prefix = ["text_encoders."] + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.Kandinsky5(self, device=device) + return out + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5Tokenizer, comfy.text_encoders.kandinsky5.te(**hunyuan_detect)) + + +class Kandinsky5Image(Kandinsky5): + unet_config = { + "image_model": "kandinsky5", + "model_dim": 2560, + "visual_embed_dim": 64, + } + + sampling_settings = { + "shift": 3.0, + } + + latent_format = latent_formats.Flux + memory_usage_factor = 1.1 #TODO + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.Kandinsky5Image(self, device=device) + return out + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage, comfy.text_encoders.kandinsky5.te(**hunyuan_detect)) + + +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Kandinsky5Image, Kandinsky5] models += [SVD_img2vid] diff --git a/comfy/text_encoders/kandinsky5.py b/comfy/text_encoders/kandinsky5.py new file mode 100644 index 000000000..22f991c36 --- /dev/null +++ b/comfy/text_encoders/kandinsky5.py @@ -0,0 +1,68 @@ +from comfy import sd1_clip +from .qwen_image import QwenImageTokenizer, QwenImageTEModel +from .llama import Qwen25_7BVLI + + +class Kandinsky5Tokenizer(QwenImageTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) + self.llama_template = "<|im_start|>system\nYou are a prompt engineer. Describe the video in detail.\nDescribe how the camera moves or shakes, describe the zoom and view angle, whether it follows the objects.\nDescribe the location of the video, main characters or objects and their action.\nDescribe the dynamism of the video and presented actions.\nName the visual style of the video: whether it is a professional footage, user generated content, some kind of animation, video game or screen content.\nDescribe the visual effects, postprocessing and transitions if they are presented in the video.\nPay attention to the order of key actions shown in the scene.<|im_end|>\n<|im_start|>user\n{}<|im_end|>" + self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) + + def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs): + out = super().tokenize_with_weights(text, return_word_ids, **kwargs) + out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids, **kwargs) + + return out + + +class Kandinsky5TokenizerImage(Kandinsky5Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) + self.llama_template = "<|im_start|>system\nYou are a promt engineer. Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>" + + +class Qwen25_7BVLIModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="hidden", layer_idx=-1, dtype=None, attention_mask=True, model_options={}): + llama_scaled_fp8 = model_options.get("qwen_scaled_fp8", None) + if llama_scaled_fp8 is not None: + model_options = model_options.copy() + model_options["scaled_fp8"] = llama_scaled_fp8 + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) + + +class Kandinsky5TEModel(QwenImageTEModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + super(QwenImageTEModel, self).__init__(device=device, dtype=dtype, name="qwen25_7b", clip_model=Qwen25_7BVLIModel, model_options=model_options) + self.clip_l = sd1_clip.SDClipModel(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options) + + def encode_token_weights(self, token_weight_pairs): + cond, p, extra = super().encode_token_weights(token_weight_pairs, template_end=-1) + l_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs["l"]) + + return cond, l_pooled, extra + + def set_clip_options(self, options): + super().set_clip_options(options) + self.clip_l.set_clip_options(options) + + def reset_clip_options(self): + super().reset_clip_options() + self.clip_l.reset_clip_options() + + def load_sd(self, sd): + if "text_model.encoder.layers.1.mlp.fc1.weight" in sd: + return self.clip_l.load_sd(sd) + else: + return super().load_sd(sd) + +def te(dtype_llama=None, llama_scaled_fp8=None): + class Kandinsky5TEModel_(Kandinsky5TEModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: + model_options = model_options.copy() + model_options["qwen_scaled_fp8"] = llama_scaled_fp8 + if dtype_llama is not None: + dtype = dtype_llama + super().__init__(device=device, dtype=dtype, model_options=model_options) + return Kandinsky5TEModel_ diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 866c3e0eb..d7cbe68cf 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -568,6 +568,8 @@ class Conditioning(ComfyTypeIO): '''Used by WAN Camera.''' time_dim_concat: NotRequired[torch.Tensor] '''Used by WAN Phantom Subject.''' + time_dim_replace: NotRequired[torch.Tensor] + '''Used by Kandinsky5 I2V.''' CondList = list[tuple[torch.Tensor, PooledDict]] Type = CondList diff --git a/comfy_extras/nodes_kandinsky5.py b/comfy_extras/nodes_kandinsky5.py new file mode 100644 index 000000000..9cb234be1 --- /dev/null +++ b/comfy_extras/nodes_kandinsky5.py @@ -0,0 +1,136 @@ +import nodes +import node_helpers +import torch +import comfy.model_management +import comfy.utils + +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io + + +class Kandinsky5ImageToVideo(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="Kandinsky5ImageToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Int.Input("width", default=768, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=512, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=121, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Image.Input("start_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent", tooltip="Empty video latent"), + io.Latent.Output(display_name="cond_latent", tooltip="Clean encoded start images, used to replace the noisy start of the model output latents"), + ], + ) + + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None) -> io.NodeOutput: + latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + cond_latent_out = {} + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + encoded = vae.encode(start_image[:, :, :, :3]) + cond_latent_out["samples"] = encoded + + mask = torch.ones((1, 1, latent.shape[2], latent.shape[-2], latent.shape[-1]), device=start_image.device, dtype=start_image.dtype) + mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0 + + positive = node_helpers.conditioning_set_values(positive, {"time_dim_replace": encoded, "concat_mask": mask}) + negative = node_helpers.conditioning_set_values(negative, {"time_dim_replace": encoded, "concat_mask": mask}) + + out_latent = {} + out_latent["samples"] = latent + return io.NodeOutput(positive, negative, out_latent, cond_latent_out) + + +def adaptive_mean_std_normalization(source, reference, clump_mean_low=0.3, clump_mean_high=0.35, clump_std_low=0.35, clump_std_high=0.5): + source_mean = source.mean(dim=(1, 3, 4), keepdim=True) # mean over C, H, W + source_std = source.std(dim=(1, 3, 4), keepdim=True) # std over C, H, W + + reference_mean = torch.clamp(reference.mean(), source_mean - clump_mean_low, source_mean + clump_mean_high) + reference_std = torch.clamp(reference.std(), source_std - clump_std_low, source_std + clump_std_high) + + # normalization + normalized = (source - source_mean) / (source_std + 1e-8) + normalized = normalized * reference_std + reference_mean + + return normalized + + +class NormalizeVideoLatentStart(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="NormalizeVideoLatentStart", + category="conditioning/video_models", + description="Normalizes the initial frames of a video latent to match the mean and standard deviation of subsequent reference frames. Helps reduce differences between the starting frames and the rest of the video.", + inputs=[ + io.Latent.Input("latent"), + io.Int.Input("start_frame_count", default=4, min=1, max=nodes.MAX_RESOLUTION, step=1, tooltip="Number of latent frames to normalize, counted from the start"), + io.Int.Input("reference_frame_count", default=5, min=1, max=nodes.MAX_RESOLUTION, step=1, tooltip="Number of latent frames after the start frames to use as reference"), + ], + outputs=[ + io.Latent.Output(display_name="latent"), + ], + ) + + @classmethod + def execute(cls, latent, start_frame_count, reference_frame_count) -> io.NodeOutput: + if latent["samples"].shape[2] <= 1: + return io.NodeOutput(latent) + s = latent.copy() + samples = latent["samples"].clone() + + first_frames = samples[:, :, :start_frame_count] + reference_frames_data = samples[:, :, start_frame_count:start_frame_count+min(reference_frame_count, samples.shape[2]-1)] + normalized_first_frames = adaptive_mean_std_normalization(first_frames, reference_frames_data) + + samples[:, :, :start_frame_count] = normalized_first_frames + s["samples"] = samples + return io.NodeOutput(s) + + +class CLIPTextEncodeKandinsky5(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodeKandinsky5", + category="advanced/conditioning/kandinsky5", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("clip_l", multiline=True, dynamic_prompts=True), + io.String.Input("qwen25_7b", multiline=True, dynamic_prompts=True), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) + + @classmethod + def execute(cls, clip, clip_l, qwen25_7b) -> io.NodeOutput: + tokens = clip.tokenize(clip_l) + tokens["qwen25_7b"] = clip.tokenize(qwen25_7b)["qwen25_7b"] + + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens)) + + +class Kandinsky5Extension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + Kandinsky5ImageToVideo, + NormalizeVideoLatentStart, + CLIPTextEncodeKandinsky5, + ] + +async def comfy_entrypoint() -> Kandinsky5Extension: + return Kandinsky5Extension() diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py index d2df07ff9..e439b18ef 100644 --- a/comfy_extras/nodes_latent.py +++ b/comfy_extras/nodes_latent.py @@ -4,7 +4,7 @@ import torch import nodes from typing_extensions import override from comfy_api.latest import ComfyExtension, io - +import logging def reshape_latent_to(target_shape, latent, repeat_batch=True): if latent.shape[1:] != target_shape[1:]: @@ -388,6 +388,42 @@ class LatentOperationSharpen(io.ComfyNode): return luminance * sharpened return io.NodeOutput(sharpen) +class ReplaceVideoLatentFrames(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="ReplaceVideoLatentFrames", + category="latent/batch", + inputs=[ + io.Latent.Input("destination", tooltip="The destination latent where frames will be replaced."), + io.Latent.Input("source", optional=True, tooltip="The source latent providing frames to insert into the destination latent. If not provided, the destination latent is returned unchanged."), + io.Int.Input("index", default=0, min=-nodes.MAX_RESOLUTION, max=nodes.MAX_RESOLUTION, step=1, tooltip="The starting latent frame index in the destination latent where the source latent frames will be placed. Negative values count from the end."), + ], + outputs=[ + io.Latent.Output(), + ], + ) + + @classmethod + def execute(cls, destination, index, source=None) -> io.NodeOutput: + if source is None: + return io.NodeOutput(destination) + dest_frames = destination["samples"].shape[2] + source_frames = source["samples"].shape[2] + if index < 0: + index = dest_frames + index + if index > dest_frames: + logging.warning(f"ReplaceVideoLatentFrames: Index {index} is out of bounds for destination latent frames {dest_frames}.") + return io.NodeOutput(destination) + if index + source_frames > dest_frames: + logging.warning(f"ReplaceVideoLatentFrames: Source latent frames {source_frames} do not fit within destination latent frames {dest_frames} at the specified index {index}.") + return io.NodeOutput(destination) + s = source.copy() + s_source = source["samples"] + s_destination = destination["samples"].clone() + s_destination[:, :, index:index + s_source.shape[2]] = s_source + s["samples"] = s_destination + return io.NodeOutput(s) class LatentExtension(ComfyExtension): @override @@ -405,6 +441,7 @@ class LatentExtension(ComfyExtension): LatentApplyOperationCFG, LatentOperationTonemapReinhard, LatentOperationSharpen, + ReplaceVideoLatentFrames ] diff --git a/nodes.py b/nodes.py index 356aa63df..8d28a725d 100644 --- a/nodes.py +++ b/nodes.py @@ -970,7 +970,7 @@ class DualCLIPLoader: def INPUT_TYPES(s): return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ), "clip_name2": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15"], ), + "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15", "kandinsky5", "kandinsky5_image"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), @@ -2357,6 +2357,7 @@ async def init_builtin_extra_nodes(): "nodes_rope.py", "nodes_logic.py", "nodes_nop.py", + "nodes_kandinsky5.py", ] import_failed = [] From ae676ed105663bb225153c8bca406f00edf738b4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 5 Dec 2025 20:01:19 -0800 Subject: [PATCH 694/759] Fix regression. (#11137) --- comfy/supported_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 91cc4ef08..383c82c3e 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1529,6 +1529,6 @@ class Kandinsky5Image(Kandinsky5): return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage, comfy.text_encoders.kandinsky5.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Kandinsky5Image, Kandinsky5] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5] models += [SVD_img2vid] From 117bf3f2bd9235cb5942a1de10a534c9869c7444 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 6 Dec 2025 06:22:02 +0200 Subject: [PATCH 695/759] convert nodes_freelunch.py to the V3 schema (#10904) --- comfy_extras/nodes_freelunch.py | 89 +++++++++++++++++---------- comfy_extras/nodes_model_downscale.py | 5 -- 2 files changed, 57 insertions(+), 37 deletions(-) diff --git a/comfy_extras/nodes_freelunch.py b/comfy_extras/nodes_freelunch.py index e3ac58447..3429b731e 100644 --- a/comfy_extras/nodes_freelunch.py +++ b/comfy_extras/nodes_freelunch.py @@ -2,6 +2,8 @@ import torch import logging +from typing_extensions import override +from comfy_api.latest import ComfyExtension, IO def Fourier_filter(x, threshold, scale): # FFT @@ -22,21 +24,26 @@ def Fourier_filter(x, threshold, scale): return x_filtered.to(x.dtype) -class FreeU: +class FreeU(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "b1": ("FLOAT", {"default": 1.1, "min": 0.0, "max": 10.0, "step": 0.01}), - "b2": ("FLOAT", {"default": 1.2, "min": 0.0, "max": 10.0, "step": 0.01}), - "s1": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 10.0, "step": 0.01}), - "s2": ("FLOAT", {"default": 0.2, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return IO.Schema( + node_id="FreeU", + category="model_patches/unet", + inputs=[ + IO.Model.Input("model"), + IO.Float.Input("b1", default=1.1, min=0.0, max=10.0, step=0.01), + IO.Float.Input("b2", default=1.2, min=0.0, max=10.0, step=0.01), + IO.Float.Input("s1", default=0.9, min=0.0, max=10.0, step=0.01), + IO.Float.Input("s2", default=0.2, min=0.0, max=10.0, step=0.01), + ], + outputs=[ + IO.Model.Output(), + ], + ) - CATEGORY = "model_patches/unet" - - def patch(self, model, b1, b2, s1, s2): + @classmethod + def execute(cls, model, b1, b2, s1, s2) -> IO.NodeOutput: model_channels = model.model.model_config.unet_config["model_channels"] scale_dict = {model_channels * 4: (b1, s1), model_channels * 2: (b2, s2)} on_cpu_devices = {} @@ -59,23 +66,31 @@ class FreeU: m = model.clone() m.set_model_output_block_patch(output_block_patch) - return (m, ) + return IO.NodeOutput(m) -class FreeU_V2: + patch = execute # TODO: remove + + +class FreeU_V2(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "b1": ("FLOAT", {"default": 1.3, "min": 0.0, "max": 10.0, "step": 0.01}), - "b2": ("FLOAT", {"default": 1.4, "min": 0.0, "max": 10.0, "step": 0.01}), - "s1": ("FLOAT", {"default": 0.9, "min": 0.0, "max": 10.0, "step": 0.01}), - "s2": ("FLOAT", {"default": 0.2, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return IO.Schema( + node_id="FreeU_V2", + category="model_patches/unet", + inputs=[ + IO.Model.Input("model"), + IO.Float.Input("b1", default=1.3, min=0.0, max=10.0, step=0.01), + IO.Float.Input("b2", default=1.4, min=0.0, max=10.0, step=0.01), + IO.Float.Input("s1", default=0.9, min=0.0, max=10.0, step=0.01), + IO.Float.Input("s2", default=0.2, min=0.0, max=10.0, step=0.01), + ], + outputs=[ + IO.Model.Output(), + ], + ) - CATEGORY = "model_patches/unet" - - def patch(self, model, b1, b2, s1, s2): + @classmethod + def execute(cls, model, b1, b2, s1, s2) -> IO.NodeOutput: model_channels = model.model.model_config.unet_config["model_channels"] scale_dict = {model_channels * 4: (b1, s1), model_channels * 2: (b2, s2)} on_cpu_devices = {} @@ -105,9 +120,19 @@ class FreeU_V2: m = model.clone() m.set_model_output_block_patch(output_block_patch) - return (m, ) + return IO.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "FreeU": FreeU, - "FreeU_V2": FreeU_V2, -} + patch = execute # TODO: remove + + +class FreelunchExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + FreeU, + FreeU_V2, + ] + + +async def comfy_entrypoint() -> FreelunchExtension: + return FreelunchExtension() diff --git a/comfy_extras/nodes_model_downscale.py b/comfy_extras/nodes_model_downscale.py index f7ca9699d..dec2ae841 100644 --- a/comfy_extras/nodes_model_downscale.py +++ b/comfy_extras/nodes_model_downscale.py @@ -53,11 +53,6 @@ class PatchModelAddDownscale(io.ComfyNode): return io.NodeOutput(m) -NODE_DISPLAY_NAME_MAPPINGS = { - # Sampling - "PatchModelAddDownscale": "", -} - class ModelDownscaleExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[io.ComfyNode]]: From 913f86b72740f84f759786a698108840a09b6498 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 6 Dec 2025 06:24:10 +0200 Subject: [PATCH 696/759] [V3] convert nodes_mask.py to V3 schema (#10669) * convert nodes_mask.py to V3 schema * set "Preview Mask" as display name for MaskPreview --- comfy_extras/nodes_mask.py | 508 +++++++++++++++++++------------------ 1 file changed, 263 insertions(+), 245 deletions(-) diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index a5e405008..290e6f55e 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -3,11 +3,10 @@ import scipy.ndimage import torch import comfy.utils import node_helpers -import folder_paths -import random +from typing_extensions import override +from comfy_api.latest import ComfyExtension, IO, UI import nodes -from nodes import MAX_RESOLUTION def composite(destination, source, x, y, mask = None, multiplier = 8, resize_source = False): source = source.to(destination.device) @@ -46,202 +45,213 @@ def composite(destination, source, x, y, mask = None, multiplier = 8, resize_sou destination[..., top:bottom, left:right] = source_portion + destination_portion return destination -class LatentCompositeMasked: +class LatentCompositeMasked(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "destination": ("LATENT",), - "source": ("LATENT",), - "x": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 8}), - "y": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 8}), - "resize_source": ("BOOLEAN", {"default": False}), - }, - "optional": { - "mask": ("MASK",), - } - } - RETURN_TYPES = ("LATENT",) - FUNCTION = "composite" + def define_schema(cls): + return IO.Schema( + node_id="LatentCompositeMasked", + category="latent", + inputs=[ + IO.Latent.Input("destination"), + IO.Latent.Input("source"), + IO.Int.Input("x", default=0, min=0, max=nodes.MAX_RESOLUTION, step=8), + IO.Int.Input("y", default=0, min=0, max=nodes.MAX_RESOLUTION, step=8), + IO.Boolean.Input("resize_source", default=False), + IO.Mask.Input("mask", optional=True), + ], + outputs=[IO.Latent.Output()], + ) - CATEGORY = "latent" - - def composite(self, destination, source, x, y, resize_source, mask = None): + @classmethod + def execute(cls, destination, source, x, y, resize_source, mask = None) -> IO.NodeOutput: output = destination.copy() destination = destination["samples"].clone() source = source["samples"] output["samples"] = composite(destination, source, x, y, mask, 8, resize_source) - return (output,) + return IO.NodeOutput(output) -class ImageCompositeMasked: + composite = execute # TODO: remove + + +class ImageCompositeMasked(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "destination": ("IMAGE",), - "source": ("IMAGE",), - "x": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 1}), - "y": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 1}), - "resize_source": ("BOOLEAN", {"default": False}), - }, - "optional": { - "mask": ("MASK",), - } - } - RETURN_TYPES = ("IMAGE",) - FUNCTION = "composite" + def define_schema(cls): + return IO.Schema( + node_id="ImageCompositeMasked", + category="image", + inputs=[ + IO.Image.Input("destination"), + IO.Image.Input("source"), + IO.Int.Input("x", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1), + IO.Int.Input("y", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1), + IO.Boolean.Input("resize_source", default=False), + IO.Mask.Input("mask", optional=True), + ], + outputs=[IO.Image.Output()], + ) - CATEGORY = "image" - - def composite(self, destination, source, x, y, resize_source, mask = None): + @classmethod + def execute(cls, destination, source, x, y, resize_source, mask = None) -> IO.NodeOutput: destination, source = node_helpers.image_alpha_fix(destination, source) destination = destination.clone().movedim(-1, 1) output = composite(destination, source.movedim(-1, 1), x, y, mask, 1, resize_source).movedim(1, -1) - return (output,) + return IO.NodeOutput(output) -class MaskToImage: + composite = execute # TODO: remove + + +class MaskToImage(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "mask": ("MASK",), - } - } + def define_schema(cls): + return IO.Schema( + node_id="MaskToImage", + display_name="Convert Mask to Image", + category="mask", + inputs=[ + IO.Mask.Input("mask"), + ], + outputs=[IO.Image.Output()], + ) - CATEGORY = "mask" - - RETURN_TYPES = ("IMAGE",) - FUNCTION = "mask_to_image" - - def mask_to_image(self, mask): + @classmethod + def execute(cls, mask) -> IO.NodeOutput: result = mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])).movedim(1, -1).expand(-1, -1, -1, 3) - return (result,) + return IO.NodeOutput(result) -class ImageToMask: + mask_to_image = execute # TODO: remove + + +class ImageToMask(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - "channel": (["red", "green", "blue", "alpha"],), - } - } + def define_schema(cls): + return IO.Schema( + node_id="ImageToMask", + display_name="Convert Image to Mask", + category="mask", + inputs=[ + IO.Image.Input("image"), + IO.Combo.Input("channel", options=["red", "green", "blue", "alpha"]), + ], + outputs=[IO.Mask.Output()], + ) - CATEGORY = "mask" - - RETURN_TYPES = ("MASK",) - FUNCTION = "image_to_mask" - - def image_to_mask(self, image, channel): + @classmethod + def execute(cls, image, channel) -> IO.NodeOutput: channels = ["red", "green", "blue", "alpha"] mask = image[:, :, :, channels.index(channel)] - return (mask,) + return IO.NodeOutput(mask) -class ImageColorToMask: + image_to_mask = execute # TODO: remove + + +class ImageColorToMask(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - "color": ("INT", {"default": 0, "min": 0, "max": 0xFFFFFF, "step": 1, "display": "color"}), - } - } + def define_schema(cls): + return IO.Schema( + node_id="ImageColorToMask", + category="mask", + inputs=[ + IO.Image.Input("image"), + IO.Int.Input("color", default=0, min=0, max=0xFFFFFF, step=1, display_mode=IO.NumberDisplay.number), + ], + outputs=[IO.Mask.Output()], + ) - CATEGORY = "mask" - - RETURN_TYPES = ("MASK",) - FUNCTION = "image_to_mask" - - def image_to_mask(self, image, color): + @classmethod + def execute(cls, image, color) -> IO.NodeOutput: temp = (torch.clamp(image, 0, 1.0) * 255.0).round().to(torch.int) temp = torch.bitwise_left_shift(temp[:,:,:,0], 16) + torch.bitwise_left_shift(temp[:,:,:,1], 8) + temp[:,:,:,2] mask = torch.where(temp == color, 1.0, 0).float() - return (mask,) + return IO.NodeOutput(mask) -class SolidMask: + image_to_mask = execute # TODO: remove + + +class SolidMask(IO.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "value": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - "width": ("INT", {"default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 1}), - "height": ("INT", {"default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 1}), - } - } + def define_schema(cls): + return IO.Schema( + node_id="SolidMask", + category="mask", + inputs=[ + IO.Float.Input("value", default=1.0, min=0.0, max=1.0, step=0.01), + IO.Int.Input("width", default=512, min=1, max=nodes.MAX_RESOLUTION, step=1), + IO.Int.Input("height", default=512, min=1, max=nodes.MAX_RESOLUTION, step=1), + ], + outputs=[IO.Mask.Output()], + ) - CATEGORY = "mask" - - RETURN_TYPES = ("MASK",) - - FUNCTION = "solid" - - def solid(self, value, width, height): + @classmethod + def execute(cls, value, width, height) -> IO.NodeOutput: out = torch.full((1, height, width), value, dtype=torch.float32, device="cpu") - return (out,) + return IO.NodeOutput(out) -class InvertMask: + solid = execute # TODO: remove + + +class InvertMask(IO.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "mask": ("MASK",), - } - } + def define_schema(cls): + return IO.Schema( + node_id="InvertMask", + category="mask", + inputs=[ + IO.Mask.Input("mask"), + ], + outputs=[IO.Mask.Output()], + ) - CATEGORY = "mask" - - RETURN_TYPES = ("MASK",) - - FUNCTION = "invert" - - def invert(self, mask): + @classmethod + def execute(cls, mask) -> IO.NodeOutput: out = 1.0 - mask - return (out,) + return IO.NodeOutput(out) -class CropMask: + invert = execute # TODO: remove + + +class CropMask(IO.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "mask": ("MASK",), - "x": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 1}), - "y": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 1}), - "width": ("INT", {"default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 1}), - "height": ("INT", {"default": 512, "min": 1, "max": MAX_RESOLUTION, "step": 1}), - } - } + def define_schema(cls): + return IO.Schema( + node_id="CropMask", + category="mask", + inputs=[ + IO.Mask.Input("mask"), + IO.Int.Input("x", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1), + IO.Int.Input("y", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1), + IO.Int.Input("width", default=512, min=1, max=nodes.MAX_RESOLUTION, step=1), + IO.Int.Input("height", default=512, min=1, max=nodes.MAX_RESOLUTION, step=1), + ], + outputs=[IO.Mask.Output()], + ) - CATEGORY = "mask" - - RETURN_TYPES = ("MASK",) - - FUNCTION = "crop" - - def crop(self, mask, x, y, width, height): + @classmethod + def execute(cls, mask, x, y, width, height) -> IO.NodeOutput: mask = mask.reshape((-1, mask.shape[-2], mask.shape[-1])) out = mask[:, y:y + height, x:x + width] - return (out,) + return IO.NodeOutput(out) -class MaskComposite: + crop = execute # TODO: remove + + +class MaskComposite(IO.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "destination": ("MASK",), - "source": ("MASK",), - "x": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 1}), - "y": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 1}), - "operation": (["multiply", "add", "subtract", "and", "or", "xor"],), - } - } + def define_schema(cls): + return IO.Schema( + node_id="MaskComposite", + category="mask", + inputs=[ + IO.Mask.Input("destination"), + IO.Mask.Input("source"), + IO.Int.Input("x", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1), + IO.Int.Input("y", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1), + IO.Combo.Input("operation", options=["multiply", "add", "subtract", "and", "or", "xor"]), + ], + outputs=[IO.Mask.Output()], + ) - CATEGORY = "mask" - - RETURN_TYPES = ("MASK",) - - FUNCTION = "combine" - - def combine(self, destination, source, x, y, operation): + @classmethod + def execute(cls, destination, source, x, y, operation) -> IO.NodeOutput: output = destination.reshape((-1, destination.shape[-2], destination.shape[-1])).clone() source = source.reshape((-1, source.shape[-2], source.shape[-1])) @@ -267,28 +277,29 @@ class MaskComposite: output = torch.clamp(output, 0.0, 1.0) - return (output,) + return IO.NodeOutput(output) -class FeatherMask: + combine = execute # TODO: remove + + +class FeatherMask(IO.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "mask": ("MASK",), - "left": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 1}), - "top": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 1}), - "right": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 1}), - "bottom": ("INT", {"default": 0, "min": 0, "max": MAX_RESOLUTION, "step": 1}), - } - } + def define_schema(cls): + return IO.Schema( + node_id="FeatherMask", + category="mask", + inputs=[ + IO.Mask.Input("mask"), + IO.Int.Input("left", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1), + IO.Int.Input("top", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1), + IO.Int.Input("right", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1), + IO.Int.Input("bottom", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1), + ], + outputs=[IO.Mask.Output()], + ) - CATEGORY = "mask" - - RETURN_TYPES = ("MASK",) - - FUNCTION = "feather" - - def feather(self, mask, left, top, right, bottom): + @classmethod + def execute(cls, mask, left, top, right, bottom) -> IO.NodeOutput: output = mask.reshape((-1, mask.shape[-2], mask.shape[-1])).clone() left = min(left, output.shape[-1]) @@ -312,26 +323,28 @@ class FeatherMask: feather_rate = (y + 1) / bottom output[:, -y, :] *= feather_rate - return (output,) + return IO.NodeOutput(output) -class GrowMask: + feather = execute # TODO: remove + + +class GrowMask(IO.ComfyNode): @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "mask": ("MASK",), - "expand": ("INT", {"default": 0, "min": -MAX_RESOLUTION, "max": MAX_RESOLUTION, "step": 1}), - "tapered_corners": ("BOOLEAN", {"default": True}), - }, - } + def define_schema(cls): + return IO.Schema( + node_id="GrowMask", + display_name="Grow Mask", + category="mask", + inputs=[ + IO.Mask.Input("mask"), + IO.Int.Input("expand", default=0, min=-nodes.MAX_RESOLUTION, max=nodes.MAX_RESOLUTION, step=1), + IO.Boolean.Input("tapered_corners", default=True), + ], + outputs=[IO.Mask.Output()], + ) - CATEGORY = "mask" - - RETURN_TYPES = ("MASK",) - - FUNCTION = "expand_mask" - - def expand_mask(self, mask, expand, tapered_corners): + @classmethod + def execute(cls, mask, expand, tapered_corners) -> IO.NodeOutput: c = 0 if tapered_corners else 1 kernel = np.array([[c, 1, c], [1, 1, 1], @@ -347,69 +360,74 @@ class GrowMask: output = scipy.ndimage.grey_dilation(output, footprint=kernel) output = torch.from_numpy(output) out.append(output) - return (torch.stack(out, dim=0),) + return IO.NodeOutput(torch.stack(out, dim=0)) -class ThresholdMask: + expand_mask = execute # TODO: remove + + +class ThresholdMask(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "mask": ("MASK",), - "value": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 1.0, "step": 0.01}), - } - } + def define_schema(cls): + return IO.Schema( + node_id="ThresholdMask", + category="mask", + inputs=[ + IO.Mask.Input("mask"), + IO.Float.Input("value", default=0.5, min=0.0, max=1.0, step=0.01), + ], + outputs=[IO.Mask.Output()], + ) - CATEGORY = "mask" - - RETURN_TYPES = ("MASK",) - FUNCTION = "image_to_mask" - - def image_to_mask(self, mask, value): + @classmethod + def execute(cls, mask, value) -> IO.NodeOutput: mask = (mask > value).float() - return (mask,) + return IO.NodeOutput(mask) + + image_to_mask = execute # TODO: remove + # Mask Preview - original implement from # https://github.com/cubiq/ComfyUI_essentials/blob/9d9f4bedfc9f0321c19faf71855e228c93bd0dc9/mask.py#L81 # upstream requested in https://github.com/Kosinkadink/rfcs/blob/main/rfcs/0000-corenodes.md#preview-nodes -class MaskPreview(nodes.SaveImage): - def __init__(self): - self.output_dir = folder_paths.get_temp_directory() - self.type = "temp" - self.prefix_append = "_temp_" + ''.join(random.choice("abcdefghijklmnopqrstupvxyz") for x in range(5)) - self.compress_level = 4 +class MaskPreview(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="MaskPreview", + display_name="Preview Mask", + category="mask", + description="Saves the input images to your ComfyUI output directory.", + inputs=[ + IO.Mask.Input("mask"), + ], + hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo], + is_output_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": {"mask": ("MASK",), }, - "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, - } - - FUNCTION = "execute" - CATEGORY = "mask" - - def execute(self, mask, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None): - preview = mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])).movedim(1, -1).expand(-1, -1, -1, 3) - return self.save_images(preview, filename_prefix, prompt, extra_pnginfo) + def execute(cls, mask, filename_prefix="ComfyUI") -> IO.NodeOutput: + return IO.NodeOutput(ui=UI.PreviewMask(mask)) -NODE_CLASS_MAPPINGS = { - "LatentCompositeMasked": LatentCompositeMasked, - "ImageCompositeMasked": ImageCompositeMasked, - "MaskToImage": MaskToImage, - "ImageToMask": ImageToMask, - "ImageColorToMask": ImageColorToMask, - "SolidMask": SolidMask, - "InvertMask": InvertMask, - "CropMask": CropMask, - "MaskComposite": MaskComposite, - "FeatherMask": FeatherMask, - "GrowMask": GrowMask, - "ThresholdMask": ThresholdMask, - "MaskPreview": MaskPreview -} +class MaskExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + LatentCompositeMasked, + ImageCompositeMasked, + MaskToImage, + ImageToMask, + ImageColorToMask, + SolidMask, + InvertMask, + CropMask, + MaskComposite, + FeatherMask, + GrowMask, + ThresholdMask, + MaskPreview, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "ImageToMask": "Convert Image to Mask", - "MaskToImage": "Convert Mask to Image", -} + +async def comfy_entrypoint() -> MaskExtension: + return MaskExtension() From d7a0aef65033bf0fe56e521577a44fac1830b8b3 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 5 Dec 2025 21:15:21 -0800 Subject: [PATCH 697/759] Set OCL_SET_SVM_SIZE on AMD. (#11139) --- cuda_malloc.py | 27 +++++++++++++++++---------- main.py | 3 +++ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/cuda_malloc.py b/cuda_malloc.py index 6520d5123..ee2bc4b69 100644 --- a/cuda_malloc.py +++ b/cuda_malloc.py @@ -63,18 +63,22 @@ def cuda_malloc_supported(): return True +version = "" + +try: + torch_spec = importlib.util.find_spec("torch") + for folder in torch_spec.submodule_search_locations: + ver_file = os.path.join(folder, "version.py") + if os.path.isfile(ver_file): + spec = importlib.util.spec_from_file_location("torch_version_import", ver_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + version = module.__version__ +except: + pass + if not args.cuda_malloc: try: - version = "" - torch_spec = importlib.util.find_spec("torch") - for folder in torch_spec.submodule_search_locations: - ver_file = os.path.join(folder, "version.py") - if os.path.isfile(ver_file): - spec = importlib.util.spec_from_file_location("torch_version_import", ver_file) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - version = module.__version__ - if int(version[0]) >= 2 and "+cu" in version: # enable by default for torch version 2.0 and up only on cuda torch if PerformanceFeature.AutoTune not in args.fast: # Autotune has issues with cuda malloc args.cuda_malloc = cuda_malloc_supported() @@ -90,3 +94,6 @@ if args.cuda_malloc and not args.disable_cuda_malloc: env_var += ",backend:cudaMallocAsync" os.environ['PYTORCH_CUDA_ALLOC_CONF'] = env_var + +def get_torch_version_noimport(): + return str(version) diff --git a/main.py b/main.py index 0cd815d9e..0d02a087b 100644 --- a/main.py +++ b/main.py @@ -167,6 +167,9 @@ if __name__ == "__main__": os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":4096:8" import cuda_malloc + if "rocm" in cuda_malloc.get_torch_version_noimport(): + os.environ['OCL_SET_SVM_SIZE'] = '262144' # set at the request of AMD + if 'torch' in sys.modules: logging.warning("WARNING: Potential Error in code: Torch already imported, torch should never be imported before this point.") From 76f18e955dcbc88ed13d6802194fd897927f93e5 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Sat, 6 Dec 2025 13:28:08 +0200 Subject: [PATCH 698/759] marked all Pika API nodes a deprecated (#11146) --- comfy_api_nodes/nodes_pika.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/comfy_api_nodes/nodes_pika.py b/comfy_api_nodes/nodes_pika.py index 51148211b..acd88c391 100644 --- a/comfy_api_nodes/nodes_pika.py +++ b/comfy_api_nodes/nodes_pika.py @@ -92,6 +92,7 @@ class PikaImageToVideo(IO.ComfyNode): IO.Hidden.unique_id, ], is_api_node=True, + is_deprecated=True, ) @classmethod @@ -152,6 +153,7 @@ class PikaTextToVideoNode(IO.ComfyNode): IO.Hidden.unique_id, ], is_api_node=True, + is_deprecated=True, ) @classmethod @@ -239,6 +241,7 @@ class PikaScenes(IO.ComfyNode): IO.Hidden.unique_id, ], is_api_node=True, + is_deprecated=True, ) @classmethod @@ -323,6 +326,7 @@ class PikAdditionsNode(IO.ComfyNode): IO.Hidden.unique_id, ], is_api_node=True, + is_deprecated=True, ) @classmethod @@ -399,6 +403,7 @@ class PikaSwapsNode(IO.ComfyNode): IO.Hidden.unique_id, ], is_api_node=True, + is_deprecated=True, ) @classmethod @@ -466,6 +471,7 @@ class PikaffectsNode(IO.ComfyNode): IO.Hidden.unique_id, ], is_api_node=True, + is_deprecated=True, ) @classmethod @@ -515,6 +521,7 @@ class PikaStartEndFrameNode(IO.ComfyNode): IO.Hidden.unique_id, ], is_api_node=True, + is_deprecated=True, ) @classmethod From 7ac7d69d948e75c3a230d1262daab84d75aff895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Sat, 6 Dec 2025 20:09:44 +0200 Subject: [PATCH 699/759] Fix EmptyAudio node input types (#11149) --- comfy_extras/nodes_audio.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 812301fb7..c7916443c 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -573,12 +573,14 @@ class EmptyAudio(IO.ComfyNode): step=0.01, tooltip="Duration of the empty audio clip in seconds", ), - IO.Float.Input( + IO.Int.Input( "sample_rate", default=44100, tooltip="Sample rate of the empty audio clip.", + min=1, + max=192000, ), - IO.Float.Input( + IO.Int.Input( "channels", default=2, min=1, From 50ca97e7765d9bbdbeec31a75f1f6c747d76948c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 6 Dec 2025 15:36:20 -0800 Subject: [PATCH 700/759] Speed up lora compute and lower memory usage by doing it in fp16. (#11161) --- comfy/model_management.py | 14 ++++++++++++++ comfy/model_patcher.py | 5 +++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index aeddbaefe..40717b1e4 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1492,6 +1492,20 @@ def extended_fp16_support(): return True +LORA_COMPUTE_DTYPES = {} +def lora_compute_dtype(device): + dtype = LORA_COMPUTE_DTYPES.get(device, None) + if dtype is not None: + return dtype + + if should_use_fp16(device): + dtype = torch.float16 + else: + dtype = torch.float32 + + LORA_COMPUTE_DTYPES[device] = dtype + return dtype + def soft_empty_cache(force=False): global cpu_state if cpu_state == CPUState.MPS: diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 215784874..4f076a6aa 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -614,10 +614,11 @@ class ModelPatcher: if key not in self.backup: self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update) + temp_dtype = comfy.model_management.lora_compute_dtype(device_to) if device_to is not None: - temp_weight = comfy.model_management.cast_to_device(weight, device_to, torch.float32, copy=True) + temp_weight = comfy.model_management.cast_to_device(weight, device_to, temp_dtype, copy=True) else: - temp_weight = weight.to(torch.float32, copy=True) + temp_weight = weight.to(temp_dtype, copy=True) if convert_func is not None: temp_weight = convert_func(temp_weight, inplace=True) From 4086acf3c2f0ca3a8861b04f6179fa9f908e3e25 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Sun, 7 Dec 2025 09:42:09 +1000 Subject: [PATCH 701/759] Fix on-load VRAM OOM (#11144) slow down the CPU on model load to not run ahead. This fixes a VRAM on flux 2 load. I went to try and debug this with the memory trace pickles, which needs --disable-cuda-malloc which made the bug go away. So I tried this synchronize and it worked. The has some very complex interactions with the cuda malloc async and I dont have solid theory on this one yet. Still debugging but this gets us over the OOM for the moment. --- comfy/model_patcher.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 4f076a6aa..5b1ccb824 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -762,6 +762,8 @@ class ModelPatcher: key = "{}.{}".format(n, param) self.unpin_weight(key) self.patch_weight_to_device(key, device_to=device_to) + if comfy.model_management.is_device_cuda(device_to): + torch.cuda.synchronize() logging.debug("lowvram: loaded module regularly {} {}".format(n, m)) m.comfy_patched_weights = True From 329480da5ab32949a411548f821ea60ab3e90dc7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 6 Dec 2025 17:50:10 -0800 Subject: [PATCH 702/759] Fix qwen scaled fp8 not working with kandinsky. Make basic t2i wf work. (#11162) --- comfy/ldm/kandinsky5/model.py | 8 +++++++- comfy/text_encoders/kandinsky5.py | 12 ++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/comfy/ldm/kandinsky5/model.py b/comfy/ldm/kandinsky5/model.py index a653e02fc..1509de2f8 100644 --- a/comfy/ldm/kandinsky5/model.py +++ b/comfy/ldm/kandinsky5/model.py @@ -387,6 +387,9 @@ class Kandinsky5(nn.Module): return self.out_layer(visual_embed, time_embed) def _forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options={}, **kwargs): + original_dims = x.ndim + if original_dims == 4: + x = x.unsqueeze(2) bs, c, t_len, h, w = x.shape x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size) @@ -397,7 +400,10 @@ class Kandinsky5(nn.Module): freqs = self.rope_encode_3d(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options) freqs_text = self.rope_encode_1d(context.shape[1], device=x.device, dtype=x.dtype, transformer_options=transformer_options) - return self.forward_orig(x, timestep, context, y, freqs, freqs_text, transformer_options=transformer_options, **kwargs) + out = self.forward_orig(x, timestep, context, y, freqs, freqs_text, transformer_options=transformer_options, **kwargs) + if original_dims == 4: + out = out.squeeze(2) + return out def forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options={}, **kwargs): return comfy.patcher_extension.WrapperExecutor.new_class_executor( diff --git a/comfy/text_encoders/kandinsky5.py b/comfy/text_encoders/kandinsky5.py index 22f991c36..be086458c 100644 --- a/comfy/text_encoders/kandinsky5.py +++ b/comfy/text_encoders/kandinsky5.py @@ -24,10 +24,10 @@ class Kandinsky5TokenizerImage(Kandinsky5Tokenizer): class Qwen25_7BVLIModel(sd1_clip.SDClipModel): def __init__(self, device="cpu", layer="hidden", layer_idx=-1, dtype=None, attention_mask=True, model_options={}): - llama_scaled_fp8 = model_options.get("qwen_scaled_fp8", None) - if llama_scaled_fp8 is not None: + llama_quantization_metadata = model_options.get("llama_quantization_metadata", None) + if llama_quantization_metadata is not None: model_options = model_options.copy() - model_options["scaled_fp8"] = llama_scaled_fp8 + model_options["quantization_metadata"] = llama_quantization_metadata super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) @@ -56,12 +56,12 @@ class Kandinsky5TEModel(QwenImageTEModel): else: return super().load_sd(sd) -def te(dtype_llama=None, llama_scaled_fp8=None): +def te(dtype_llama=None, llama_quantization_metadata=None): class Kandinsky5TEModel_(Kandinsky5TEModel): def __init__(self, device="cpu", dtype=None, model_options={}): - if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: + if llama_quantization_metadata is not None: model_options = model_options.copy() - model_options["qwen_scaled_fp8"] = llama_scaled_fp8 + model_options["llama_quantization_metadata"] = llama_quantization_metadata if dtype_llama is not None: dtype = dtype_llama super().__init__(device=device, dtype=dtype, model_options=model_options) From 56fa7dbe380cb5591c5542f8aa51ce2fc26beedf Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 7 Dec 2025 04:44:55 -0800 Subject: [PATCH 703/759] Properly load the newbie diffusion model. (#11172) There is still one of the text encoders missing and I didn't actually test it. --- comfy/ldm/lumina/model.py | 35 +++++++++++++++++++++++++++++++++++ comfy/model_base.py | 4 ++++ comfy/model_detection.py | 3 +++ 3 files changed, 42 insertions(+) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index 6c24fed9b..c47df49ca 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -377,6 +377,7 @@ class NextDiT(nn.Module): z_image_modulation=False, time_scale=1.0, pad_tokens_multiple=None, + clip_text_dim=None, image_model=None, device=None, dtype=None, @@ -447,6 +448,31 @@ class NextDiT(nn.Module): ), ) + self.clip_text_pooled_proj = None + + if clip_text_dim is not None: + self.clip_text_dim = clip_text_dim + self.clip_text_pooled_proj = nn.Sequential( + operation_settings.get("operations").RMSNorm(clip_text_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), + operation_settings.get("operations").Linear( + clip_text_dim, + clip_text_dim, + bias=True, + device=operation_settings.get("device"), + dtype=operation_settings.get("dtype"), + ), + ) + self.time_text_embed = nn.Sequential( + nn.SiLU(), + operation_settings.get("operations").Linear( + min(dim, 1024) + clip_text_dim, + min(dim, 1024), + bias=True, + device=operation_settings.get("device"), + dtype=operation_settings.get("dtype"), + ), + ) + self.layers = nn.ModuleList( [ JointTransformerBlock( @@ -585,6 +611,15 @@ class NextDiT(nn.Module): cap_feats = self.cap_embedder(cap_feats) # (N, L, D) # todo check if able to batchify w.o. redundant compute + if self.clip_text_pooled_proj is not None: + pooled = kwargs.get("clip_text_pooled", None) + if pooled is not None: + pooled = self.clip_text_pooled_proj(pooled) + else: + pooled = torch.zeros((1, self.clip_text_dim), device=x.device, dtype=x.dtype) + + adaln_input = self.time_text_embed(torch.cat((t, pooled), dim=-1)) + patches = transformer_options.get("patches", {}) x_is_tensor = isinstance(x, torch.Tensor) img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options) diff --git a/comfy/model_base.py b/comfy/model_base.py index 0be006cc2..6b8a8454d 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1110,6 +1110,10 @@ class Lumina2(BaseModel): if 'num_tokens' not in out: out['num_tokens'] = comfy.conds.CONDConstant(cross_attn.shape[1]) + clip_text_pooled = kwargs["pooled_output"] # Newbie + if clip_text_pooled is not None: + out['clip_text_pooled'] = comfy.conds.CONDRegular(clip_text_pooled) + return out class WAN21(BaseModel): diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 30b33a486..74c547427 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -423,6 +423,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["axes_lens"] = [300, 512, 512] dit_config["rope_theta"] = 10000.0 dit_config["ffn_dim_multiplier"] = 4.0 + ctd_weight = state_dict.get('{}clip_text_pooled_proj.0.weight'.format(key_prefix), None) + if ctd_weight is not None: + dit_config["clip_text_dim"] = ctd_weight.shape[0] elif dit_config["dim"] == 3840: # Z image dit_config["n_heads"] = 30 dit_config["n_kv_heads"] = 30 From ec7f65187d85e22ea23345ce0d919e11768f255e Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 8 Dec 2025 11:21:41 +0200 Subject: [PATCH 704/759] chore(comfy_api): replace absolute imports with relative (#11145) --- comfy_api/latest/__init__.py | 8 ++++---- comfy_api/latest/_input/video_types.py | 2 +- comfy_api/latest/_input_impl/video_types.py | 4 ++-- comfy_api/latest/_io.py | 2 +- comfy_api/latest/_ui.py | 2 +- comfy_api/latest/_util/video_types.py | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py index 0fa01d1e7..35e1ac853 100644 --- a/comfy_api/latest/__init__.py +++ b/comfy_api/latest/__init__.py @@ -5,9 +5,9 @@ from typing import Type, TYPE_CHECKING from comfy_api.internal import ComfyAPIBase from comfy_api.internal.singleton import ProxiedSingleton from comfy_api.internal.async_to_sync import create_sync_class -from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput -from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents -from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL +from ._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput +from ._input_impl import VideoFromFile, VideoFromComponents +from ._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL from . import _io_public as io from . import _ui_public as ui # from comfy_api.latest._resources import _RESOURCES as resources #noqa: F401 @@ -80,7 +80,7 @@ class ComfyExtension(ABC): async def on_load(self) -> None: """ Called when an extension is loaded. - This should be used to initialize any global resources neeeded by the extension. + This should be used to initialize any global resources needed by the extension. """ @abstractmethod diff --git a/comfy_api/latest/_input/video_types.py b/comfy_api/latest/_input/video_types.py index 87c81d73a..e634a0311 100644 --- a/comfy_api/latest/_input/video_types.py +++ b/comfy_api/latest/_input/video_types.py @@ -4,7 +4,7 @@ from fractions import Fraction from typing import Optional, Union, IO import io import av -from comfy_api.util import VideoContainer, VideoCodec, VideoComponents +from .._util import VideoContainer, VideoCodec, VideoComponents class VideoInput(ABC): """ diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py index a4cd3737d..ea35c6062 100644 --- a/comfy_api/latest/_input_impl/video_types.py +++ b/comfy_api/latest/_input_impl/video_types.py @@ -3,14 +3,14 @@ from av.container import InputContainer from av.subtitles.stream import SubtitleStream from fractions import Fraction from typing import Optional -from comfy_api.latest._input import AudioInput, VideoInput +from .._input import AudioInput, VideoInput import av import io import json import numpy as np import math import torch -from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents +from .._util import VideoContainer, VideoCodec, VideoComponents def container_to_output_format(container_format: str | None) -> str | None: diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index d7cbe68cf..313a5af20 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -26,7 +26,7 @@ if TYPE_CHECKING: from comfy_api.input import VideoInput from comfy_api.internal import (_ComfyNodeInternal, _NodeOutputInternal, classproperty, copy_class, first_real_override, is_class, prune_dict, shallow_clone_class) -from comfy_api.latest._resources import Resources, ResourcesLocal +from ._resources import Resources, ResourcesLocal from comfy_execution.graph_utils import ExecutionBlocker from ._util import MESH, VOXEL diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py index 5a75a3aae..2babe209a 100644 --- a/comfy_api/latest/_ui.py +++ b/comfy_api/latest/_ui.py @@ -22,7 +22,7 @@ import folder_paths # used for image preview from comfy.cli_args import args -from comfy_api.latest._io import ComfyNode, FolderType, Image, _UIOutput +from ._io import ComfyNode, FolderType, Image, _UIOutput class SavedResult(dict): diff --git a/comfy_api/latest/_util/video_types.py b/comfy_api/latest/_util/video_types.py index c3e3d8e3a..fd3b5a510 100644 --- a/comfy_api/latest/_util/video_types.py +++ b/comfy_api/latest/_util/video_types.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from enum import Enum from fractions import Fraction from typing import Optional -from comfy_api.latest._input import ImageInput, AudioInput +from .._input import ImageInput, AudioInput class VideoCodec(str, Enum): AUTO = "auto" From 058f084371ef2ed0c456118dfdd3d0bfed17259b Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Mon, 8 Dec 2025 17:22:51 +0800 Subject: [PATCH 705/759] Update workflow templates to v0.7.51 (#11150) * chore: update workflow templates to v0.7.50 * Update template to 0.7.51 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f98848e20..12a7c1089 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.33.10 -comfyui-workflow-templates==0.7.25 +comfyui-workflow-templates==0.7.51 comfyui-embedded-docs==0.3.1 torch torchsde From 85c4b4ae262c2de360891dd23c6504da2f5a6014 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 8 Dec 2025 11:27:02 +0200 Subject: [PATCH 706/759] chore: replace imports of deprecated V1 classes (#11127) --- comfy_api_nodes/apis/veo_api.py | 2 +- comfy_api_nodes/nodes_gemini.py | 19 ++++++++++--------- comfy_api_nodes/nodes_ltxv.py | 17 +++++++---------- comfy_api_nodes/nodes_moonvalley.py | 19 ++++++++----------- comfy_api_nodes/nodes_runway.py | 29 +++++++++++++---------------- comfy_api_nodes/nodes_veo2.py | 12 +++++------- comfy_extras/nodes_video.py | 27 +++++++++++---------------- 7 files changed, 55 insertions(+), 70 deletions(-) diff --git a/comfy_api_nodes/apis/veo_api.py b/comfy_api_nodes/apis/veo_api.py index 8328d1aa4..23ca725b7 100644 --- a/comfy_api_nodes/apis/veo_api.py +++ b/comfy_api_nodes/apis/veo_api.py @@ -85,7 +85,7 @@ class Response1(BaseModel): raiMediaFilteredReasons: Optional[list[str]] = Field( None, description='Reasons why media was filtered by responsible AI policies' ) - videos: Optional[list[Video]] = None + videos: Optional[list[Video]] = Field(None) class VeoGenVidPollResponse(BaseModel): diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 08f7b0f64..0b7422ef7 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -13,8 +13,7 @@ import torch from typing_extensions import override import folder_paths -from comfy_api.latest import IO, ComfyExtension, Input -from comfy_api.util import VideoCodec, VideoContainer +from comfy_api.latest import IO, ComfyExtension, Input, Types from comfy_api_nodes.apis.gemini_api import ( GeminiContent, GeminiFileData, @@ -68,7 +67,7 @@ class GeminiImageModel(str, Enum): async def create_image_parts( cls: type[IO.ComfyNode], - images: torch.Tensor, + images: Input.Image, image_limit: int = 0, ) -> list[GeminiPart]: image_parts: list[GeminiPart] = [] @@ -154,8 +153,8 @@ def get_text_from_response(response: GeminiGenerateContentResponse) -> str: return "\n".join([part.text for part in parts]) -def get_image_from_response(response: GeminiGenerateContentResponse) -> torch.Tensor: - image_tensors: list[torch.Tensor] = [] +def get_image_from_response(response: GeminiGenerateContentResponse) -> Input.Image: + image_tensors: list[Input.Image] = [] parts = get_parts_by_type(response, "image/png") for part in parts: image_data = base64.b64decode(part.inlineData.data) @@ -293,7 +292,9 @@ class GeminiNode(IO.ComfyNode): def create_video_parts(cls, video_input: Input.Video) -> list[GeminiPart]: """Convert video input to Gemini API compatible parts.""" - base_64_string = video_to_base64_string(video_input, container_format=VideoContainer.MP4, codec=VideoCodec.H264) + base_64_string = video_to_base64_string( + video_input, container_format=Types.VideoContainer.MP4, codec=Types.VideoCodec.H264 + ) return [ GeminiPart( inlineData=GeminiInlineData( @@ -343,7 +344,7 @@ class GeminiNode(IO.ComfyNode): prompt: str, model: str, seed: int, - images: torch.Tensor | None = None, + images: Input.Image | None = None, audio: Input.Audio | None = None, video: Input.Video | None = None, files: list[GeminiPart] | None = None, @@ -542,7 +543,7 @@ class GeminiImage(IO.ComfyNode): prompt: str, model: str, seed: int, - images: torch.Tensor | None = None, + images: Input.Image | None = None, files: list[GeminiPart] | None = None, aspect_ratio: str = "auto", response_modalities: str = "IMAGE+TEXT", @@ -662,7 +663,7 @@ class GeminiImage2(IO.ComfyNode): aspect_ratio: str, resolution: str, response_modalities: str, - images: torch.Tensor | None = None, + images: Input.Image | None = None, files: list[GeminiPart] | None = None, ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) diff --git a/comfy_api_nodes/nodes_ltxv.py b/comfy_api_nodes/nodes_ltxv.py index 0b757a62b..7e61560dc 100644 --- a/comfy_api_nodes/nodes_ltxv.py +++ b/comfy_api_nodes/nodes_ltxv.py @@ -1,12 +1,9 @@ from io import BytesIO -from typing import Optional -import torch from pydantic import BaseModel, Field from typing_extensions import override -from comfy_api.input_impl import VideoFromFile -from comfy_api.latest import IO, ComfyExtension +from comfy_api.latest import IO, ComfyExtension, Input, InputImpl from comfy_api_nodes.util import ( ApiEndpoint, get_number_of_images, @@ -26,9 +23,9 @@ class ExecuteTaskRequest(BaseModel): model: str = Field(...) duration: int = Field(...) resolution: str = Field(...) - fps: Optional[int] = Field(25) - generate_audio: Optional[bool] = Field(True) - image_uri: Optional[str] = Field(None) + fps: int | None = Field(25) + generate_audio: bool | None = Field(True) + image_uri: str | None = Field(None) class TextToVideoNode(IO.ComfyNode): @@ -103,7 +100,7 @@ class TextToVideoNode(IO.ComfyNode): as_binary=True, max_retries=1, ) - return IO.NodeOutput(VideoFromFile(BytesIO(response))) + return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(response))) class ImageToVideoNode(IO.ComfyNode): @@ -153,7 +150,7 @@ class ImageToVideoNode(IO.ComfyNode): @classmethod async def execute( cls, - image: torch.Tensor, + image: Input.Image, model: str, prompt: str, duration: int, @@ -183,7 +180,7 @@ class ImageToVideoNode(IO.ComfyNode): as_binary=True, max_retries=1, ) - return IO.NodeOutput(VideoFromFile(BytesIO(response))) + return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(response))) class LtxvApiExtension(ComfyExtension): diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 7c31d95b3..2771e4790 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -1,11 +1,8 @@ import logging -from typing import Optional -import torch from typing_extensions import override -from comfy_api.input import VideoInput -from comfy_api.latest import IO, ComfyExtension +from comfy_api.latest import IO, ComfyExtension, Input from comfy_api_nodes.apis import ( MoonvalleyPromptResponse, MoonvalleyTextToVideoInferenceParams, @@ -61,7 +58,7 @@ def validate_task_creation_response(response) -> None: raise RuntimeError(error_msg) -def validate_video_to_video_input(video: VideoInput) -> VideoInput: +def validate_video_to_video_input(video: Input.Video) -> Input.Video: """ Validates and processes video input for Moonvalley Video-to-Video generation. @@ -82,7 +79,7 @@ def validate_video_to_video_input(video: VideoInput) -> VideoInput: return _validate_and_trim_duration(video) -def _get_video_dimensions(video: VideoInput) -> tuple[int, int]: +def _get_video_dimensions(video: Input.Video) -> tuple[int, int]: """Extracts video dimensions with error handling.""" try: return video.get_dimensions() @@ -106,7 +103,7 @@ def _validate_video_dimensions(width: int, height: int) -> None: raise ValueError(f"Resolution {width}x{height} not supported. Supported: {supported_list}") -def _validate_and_trim_duration(video: VideoInput) -> VideoInput: +def _validate_and_trim_duration(video: Input.Video) -> Input.Video: """Validates video duration and trims to 5 seconds if needed.""" duration = video.get_duration() _validate_minimum_duration(duration) @@ -119,7 +116,7 @@ def _validate_minimum_duration(duration: float) -> None: raise ValueError("Input video must be at least 5 seconds long.") -def _trim_if_too_long(video: VideoInput, duration: float) -> VideoInput: +def _trim_if_too_long(video: Input.Video, duration: float) -> Input.Video: """Trims video to 5 seconds if longer.""" if duration > 5: return trim_video(video, 5) @@ -241,7 +238,7 @@ class MoonvalleyImg2VideoNode(IO.ComfyNode): @classmethod async def execute( cls, - image: torch.Tensor, + image: Input.Image, prompt: str, negative_prompt: str, resolution: str, @@ -362,9 +359,9 @@ class MoonvalleyVideo2VideoNode(IO.ComfyNode): prompt: str, negative_prompt: str, seed: int, - video: Optional[VideoInput] = None, + video: Input.Video | None = None, control_type: str = "Motion Transfer", - motion_intensity: Optional[int] = 100, + motion_intensity: int | None = 100, steps=33, prompt_adherence=4.5, ) -> IO.NodeOutput: diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py index 2fdafbbfe..3c55039c9 100644 --- a/comfy_api_nodes/nodes_runway.py +++ b/comfy_api_nodes/nodes_runway.py @@ -11,12 +11,11 @@ User Guides: """ -from typing import Union, Optional -from typing_extensions import override from enum import Enum -import torch +from typing_extensions import override +from comfy_api.latest import IO, ComfyExtension, Input, InputImpl from comfy_api_nodes.apis import ( RunwayImageToVideoRequest, RunwayImageToVideoResponse, @@ -44,8 +43,6 @@ from comfy_api_nodes.util import ( sync_op, poll_op, ) -from comfy_api.input_impl import VideoFromFile -from comfy_api.latest import ComfyExtension, IO PATH_IMAGE_TO_VIDEO = "/proxy/runway/image_to_video" PATH_TEXT_TO_IMAGE = "/proxy/runway/text_to_image" @@ -80,7 +77,7 @@ class RunwayGen3aAspectRatio(str, Enum): field_1280_768 = "1280:768" -def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]: +def get_video_url_from_task_status(response: TaskStatusResponse) -> str | None: """Returns the video URL from the task status response if it exists.""" if hasattr(response, "output") and len(response.output) > 0: return response.output[0] @@ -89,13 +86,13 @@ def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, N def extract_progress_from_task_status( response: TaskStatusResponse, -) -> Union[float, None]: +) -> float | None: if hasattr(response, "progress") and response.progress is not None: return response.progress * 100 return None -def get_image_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]: +def get_image_url_from_task_status(response: TaskStatusResponse) -> str | None: """Returns the image URL from the task status response if it exists.""" if hasattr(response, "output") and len(response.output) > 0: return response.output[0] @@ -103,7 +100,7 @@ def get_image_url_from_task_status(response: TaskStatusResponse) -> Union[str, N async def get_response( - cls: type[IO.ComfyNode], task_id: str, estimated_duration: Optional[int] = None + cls: type[IO.ComfyNode], task_id: str, estimated_duration: int | None = None ) -> TaskStatusResponse: """Poll the task status until it is finished then get the response.""" return await poll_op( @@ -119,8 +116,8 @@ async def get_response( async def generate_video( cls: type[IO.ComfyNode], request: RunwayImageToVideoRequest, - estimated_duration: Optional[int] = None, -) -> VideoFromFile: + estimated_duration: int | None = None, +) -> InputImpl.VideoFromFile: initial_response = await sync_op( cls, endpoint=ApiEndpoint(path=PATH_IMAGE_TO_VIDEO, method="POST"), @@ -193,7 +190,7 @@ class RunwayImageToVideoNodeGen3a(IO.ComfyNode): async def execute( cls, prompt: str, - start_frame: torch.Tensor, + start_frame: Input.Image, duration: str, ratio: str, seed: int, @@ -283,7 +280,7 @@ class RunwayImageToVideoNodeGen4(IO.ComfyNode): async def execute( cls, prompt: str, - start_frame: torch.Tensor, + start_frame: Input.Image, duration: str, ratio: str, seed: int, @@ -381,8 +378,8 @@ class RunwayFirstLastFrameNode(IO.ComfyNode): async def execute( cls, prompt: str, - start_frame: torch.Tensor, - end_frame: torch.Tensor, + start_frame: Input.Image, + end_frame: Input.Image, duration: str, ratio: str, seed: int, @@ -467,7 +464,7 @@ class RunwayTextToImageNode(IO.ComfyNode): cls, prompt: str, ratio: str, - reference_image: Optional[torch.Tensor] = None, + reference_image: Input.Image | None = None, ) -> IO.NodeOutput: validate_string(prompt, min_length=1) diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index a54dc13ab..e165b8380 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -1,11 +1,9 @@ import base64 from io import BytesIO -import torch from typing_extensions import override -from comfy_api.input_impl.video_types import VideoFromFile -from comfy_api.latest import IO, ComfyExtension +from comfy_api.latest import IO, ComfyExtension, Input, InputImpl from comfy_api_nodes.apis.veo_api import ( VeoGenVidPollRequest, VeoGenVidPollResponse, @@ -232,7 +230,7 @@ class VeoVideoGenerationNode(IO.ComfyNode): # Check if video is provided as base64 or URL if hasattr(video, "bytesBase64Encoded") and video.bytesBase64Encoded: - return IO.NodeOutput(VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded)))) + return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded)))) if hasattr(video, "gcsUri") and video.gcsUri: return IO.NodeOutput(await download_url_to_video_output(video.gcsUri)) @@ -431,8 +429,8 @@ class Veo3FirstLastFrameNode(IO.ComfyNode): aspect_ratio: str, duration: int, seed: int, - first_frame: torch.Tensor, - last_frame: torch.Tensor, + first_frame: Input.Image, + last_frame: Input.Image, model: str, generate_audio: bool, ): @@ -493,7 +491,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode): if response.videos: video = response.videos[0] if video.bytesBase64Encoded: - return IO.NodeOutput(VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded)))) + return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded)))) if video.gcsUri: return IO.NodeOutput(await download_url_to_video_output(video.gcsUri)) raise Exception("Video returned but no data or URL was provided") diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py index 6cf6e39bf..c609e03da 100644 --- a/comfy_extras/nodes_video.py +++ b/comfy_extras/nodes_video.py @@ -8,10 +8,7 @@ import json from typing import Optional from typing_extensions import override from fractions import Fraction -from comfy_api.input import AudioInput, ImageInput, VideoInput -from comfy_api.input_impl import VideoFromComponents, VideoFromFile -from comfy_api.util import VideoCodec, VideoComponents, VideoContainer -from comfy_api.latest import ComfyExtension, io, ui +from comfy_api.latest import ComfyExtension, io, ui, Input, InputImpl, Types from comfy.cli_args import args class SaveWEBM(io.ComfyNode): @@ -28,7 +25,6 @@ class SaveWEBM(io.ComfyNode): io.Float.Input("fps", default=24.0, min=0.01, max=1000.0, step=0.01), io.Float.Input("crf", default=32.0, min=0, max=63.0, step=1, tooltip="Higher crf means lower quality with a smaller file size, lower crf means higher quality higher filesize."), ], - outputs=[], hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo], is_output_node=True, ) @@ -79,16 +75,15 @@ class SaveVideo(io.ComfyNode): inputs=[ io.Video.Input("video", tooltip="The video to save."), io.String.Input("filename_prefix", default="video/ComfyUI", tooltip="The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."), - io.Combo.Input("format", options=VideoContainer.as_input(), default="auto", tooltip="The format to save the video as."), - io.Combo.Input("codec", options=VideoCodec.as_input(), default="auto", tooltip="The codec to use for the video."), + io.Combo.Input("format", options=Types.VideoContainer.as_input(), default="auto", tooltip="The format to save the video as."), + io.Combo.Input("codec", options=Types.VideoCodec.as_input(), default="auto", tooltip="The codec to use for the video."), ], - outputs=[], hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo], is_output_node=True, ) @classmethod - def execute(cls, video: VideoInput, filename_prefix, format: str, codec) -> io.NodeOutput: + def execute(cls, video: Input.Video, filename_prefix, format: str, codec) -> io.NodeOutput: width, height = video.get_dimensions() full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path( filename_prefix, @@ -105,10 +100,10 @@ class SaveVideo(io.ComfyNode): metadata["prompt"] = cls.hidden.prompt if len(metadata) > 0: saved_metadata = metadata - file = f"{filename}_{counter:05}_.{VideoContainer.get_extension(format)}" + file = f"{filename}_{counter:05}_.{Types.VideoContainer.get_extension(format)}" video.save_to( os.path.join(full_output_folder, file), - format=VideoContainer(format), + format=Types.VideoContainer(format), codec=codec, metadata=saved_metadata ) @@ -135,9 +130,9 @@ class CreateVideo(io.ComfyNode): ) @classmethod - def execute(cls, images: ImageInput, fps: float, audio: Optional[AudioInput] = None) -> io.NodeOutput: + def execute(cls, images: Input.Image, fps: float, audio: Optional[Input.Audio] = None) -> io.NodeOutput: return io.NodeOutput( - VideoFromComponents(VideoComponents(images=images, audio=audio, frame_rate=Fraction(fps))) + InputImpl.VideoFromComponents(Types.VideoComponents(images=images, audio=audio, frame_rate=Fraction(fps))) ) class GetVideoComponents(io.ComfyNode): @@ -159,11 +154,11 @@ class GetVideoComponents(io.ComfyNode): ) @classmethod - def execute(cls, video: VideoInput) -> io.NodeOutput: + def execute(cls, video: Input.Video) -> io.NodeOutput: components = video.get_components() - return io.NodeOutput(components.images, components.audio, float(components.frame_rate)) + class LoadVideo(io.ComfyNode): @classmethod def define_schema(cls): @@ -185,7 +180,7 @@ class LoadVideo(io.ComfyNode): @classmethod def execute(cls, file) -> io.NodeOutput: video_path = folder_paths.get_annotated_filepath(file) - return io.NodeOutput(VideoFromFile(video_path)) + return io.NodeOutput(InputImpl.VideoFromFile(video_path)) @classmethod def fingerprint_inputs(s, file): From c3c6313fc7b24a5811efde7cfe10b7cbbea52663 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 8 Dec 2025 11:28:17 +0200 Subject: [PATCH 707/759] Added "system_prompt" input to Gemini nodes (#11177) --- comfy_api_nodes/apis/gemini_api.py | 10 +----- comfy_api_nodes/nodes_gemini.py | 52 ++++++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 11 deletions(-) diff --git a/comfy_api_nodes/apis/gemini_api.py b/comfy_api_nodes/apis/gemini_api.py index a380ecc86..f8edc38c9 100644 --- a/comfy_api_nodes/apis/gemini_api.py +++ b/comfy_api_nodes/apis/gemini_api.py @@ -84,15 +84,7 @@ class GeminiSystemInstructionContent(BaseModel): description="A list of ordered parts that make up a single message. " "Different parts may have different IANA MIME types.", ) - role: GeminiRole = Field( - ..., - description="The identity of the entity that creates the message. " - "The following values are supported: " - "user: This indicates that the message is sent by a real person, typically a user-generated message. " - "model: This indicates that the message is generated by the model. " - "The model value is used to insert messages from model into the conversation during multi-turn conversations. " - "For non-multi-turn conversations, this field can be left blank or unset.", - ) + role: GeminiRole | None = Field(..., description="The role field of systemInstruction may be ignored.") class GeminiFunctionDeclaration(BaseModel): diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 0b7422ef7..ad0f4b4d1 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -26,6 +26,8 @@ from comfy_api_nodes.apis.gemini_api import ( GeminiMimeType, GeminiPart, GeminiRole, + GeminiSystemInstructionContent, + GeminiTextPart, Modality, ) from comfy_api_nodes.util import ( @@ -42,6 +44,14 @@ from comfy_api_nodes.util import ( GEMINI_BASE_ENDPOINT = "/proxy/vertexai/gemini" GEMINI_MAX_INPUT_FILE_SIZE = 20 * 1024 * 1024 # 20 MB +GEMINI_IMAGE_SYS_PROMPT = ( + "You are an expert image-generation engine. You must ALWAYS produce an image.\n" + "Interpret all user input—regardless of " + "format, intent, or abstraction—as literal visual directives for image composition.\n" + "If a prompt is conversational or lacks specific visual details, " + "you must creatively invent a concrete visual scenario that depicts the concept.\n" + "Prioritize generating the visual representation above any text, formatting, or conversational requests." +) class GeminiModel(str, Enum): @@ -276,6 +286,13 @@ class GeminiNode(IO.ComfyNode): tooltip="Optional file(s) to use as context for the model. " "Accepts inputs from the Gemini Generate Content Input Files node.", ), + IO.String.Input( + "system_prompt", + multiline=True, + default="", + optional=True, + tooltip="Foundational instructions that dictate an AI's behavior.", + ), ], outputs=[ IO.String.Output(), @@ -348,6 +365,7 @@ class GeminiNode(IO.ComfyNode): audio: Input.Audio | None = None, video: Input.Video | None = None, files: list[GeminiPart] | None = None, + system_prompt: str = "", ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) @@ -364,7 +382,10 @@ class GeminiNode(IO.ComfyNode): if files is not None: parts.extend(files) - # Create response + gemini_system_prompt = None + if system_prompt: + gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None) + response = await sync_op( cls, endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"), @@ -374,7 +395,8 @@ class GeminiNode(IO.ComfyNode): role=GeminiRole.user, parts=parts, ) - ] + ], + systemInstruction=gemini_system_prompt, ), response_model=GeminiGenerateContentResponse, price_extractor=calculate_tokens_price, @@ -524,6 +546,13 @@ class GeminiImage(IO.ComfyNode): "'IMAGE+TEXT' to return both the generated image and a text response.", optional=True, ), + IO.String.Input( + "system_prompt", + multiline=True, + default=GEMINI_IMAGE_SYS_PROMPT, + optional=True, + tooltip="Foundational instructions that dictate an AI's behavior.", + ), ], outputs=[ IO.Image.Output(), @@ -547,6 +576,7 @@ class GeminiImage(IO.ComfyNode): files: list[GeminiPart] | None = None, aspect_ratio: str = "auto", response_modalities: str = "IMAGE+TEXT", + system_prompt: str = "", ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) parts: list[GeminiPart] = [GeminiPart(text=prompt)] @@ -560,6 +590,10 @@ class GeminiImage(IO.ComfyNode): if files is not None: parts.extend(files) + gemini_system_prompt = None + if system_prompt: + gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None) + response = await sync_op( cls, endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"), @@ -571,6 +605,7 @@ class GeminiImage(IO.ComfyNode): responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]), imageConfig=None if aspect_ratio == "auto" else image_config, ), + systemInstruction=gemini_system_prompt, ), response_model=GeminiGenerateContentResponse, price_extractor=calculate_tokens_price, @@ -641,6 +676,13 @@ class GeminiImage2(IO.ComfyNode): tooltip="Optional file(s) to use as context for the model. " "Accepts inputs from the Gemini Generate Content Input Files node.", ), + IO.String.Input( + "system_prompt", + multiline=True, + default=GEMINI_IMAGE_SYS_PROMPT, + optional=True, + tooltip="Foundational instructions that dictate an AI's behavior.", + ), ], outputs=[ IO.Image.Output(), @@ -665,6 +707,7 @@ class GeminiImage2(IO.ComfyNode): response_modalities: str, images: Input.Image | None = None, files: list[GeminiPart] | None = None, + system_prompt: str = "", ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) @@ -680,6 +723,10 @@ class GeminiImage2(IO.ComfyNode): if aspect_ratio != "auto": image_config.aspectRatio = aspect_ratio + gemini_system_prompt = None + if system_prompt: + gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None) + response = await sync_op( cls, ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"), @@ -691,6 +738,7 @@ class GeminiImage2(IO.ComfyNode): responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]), imageConfig=image_config, ), + systemInstruction=gemini_system_prompt, ), response_model=GeminiGenerateContentResponse, price_extractor=calculate_tokens_price, From fd271dedfde6e192a1f1a025521070876e89e04a Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 8 Dec 2025 11:33:46 +0200 Subject: [PATCH 708/759] [API Nodes] add support for seedance-1-0-pro-fast model (#10947) * feat(api-nodes): add support for seedance-1-0-pro-fast model * feat(api-nodes): add support for seedream-4.5 model --- comfy_api_nodes/apis/bytedance_api.py | 144 +++++++++++++++ comfy_api_nodes/nodes_bytedance.py | 255 ++++++-------------------- 2 files changed, 196 insertions(+), 203 deletions(-) create mode 100644 comfy_api_nodes/apis/bytedance_api.py diff --git a/comfy_api_nodes/apis/bytedance_api.py b/comfy_api_nodes/apis/bytedance_api.py new file mode 100644 index 000000000..77cd76f9b --- /dev/null +++ b/comfy_api_nodes/apis/bytedance_api.py @@ -0,0 +1,144 @@ +from typing import Literal + +from pydantic import BaseModel, Field + + +class Text2ImageTaskCreationRequest(BaseModel): + model: str = Field(...) + prompt: str = Field(...) + response_format: str | None = Field("url") + size: str | None = Field(None) + seed: int | None = Field(0, ge=0, le=2147483647) + guidance_scale: float | None = Field(..., ge=1.0, le=10.0) + watermark: bool | None = Field(True) + + +class Image2ImageTaskCreationRequest(BaseModel): + model: str = Field(...) + prompt: str = Field(...) + response_format: str | None = Field("url") + image: str = Field(..., description="Base64 encoded string or image URL") + size: str | None = Field("adaptive") + seed: int | None = Field(..., ge=0, le=2147483647) + guidance_scale: float | None = Field(..., ge=1.0, le=10.0) + watermark: bool | None = Field(True) + + +class Seedream4Options(BaseModel): + max_images: int = Field(15) + + +class Seedream4TaskCreationRequest(BaseModel): + model: str = Field(...) + prompt: str = Field(...) + response_format: str = Field("url") + image: list[str] | None = Field(None, description="Image URLs") + size: str = Field(...) + seed: int = Field(..., ge=0, le=2147483647) + sequential_image_generation: str = Field("disabled") + sequential_image_generation_options: Seedream4Options = Field(Seedream4Options(max_images=15)) + watermark: bool = Field(True) + + +class ImageTaskCreationResponse(BaseModel): + model: str = Field(...) + created: int = Field(..., description="Unix timestamp (in seconds) indicating time when the request was created.") + data: list = Field([], description="Contains information about the generated image(s).") + error: dict = Field({}, description="Contains `code` and `message` fields in case of error.") + + +class TaskTextContent(BaseModel): + type: str = Field("text") + text: str = Field(...) + + +class TaskImageContentUrl(BaseModel): + url: str = Field(...) + + +class TaskImageContent(BaseModel): + type: str = Field("image_url") + image_url: TaskImageContentUrl = Field(...) + role: Literal["first_frame", "last_frame", "reference_image"] | None = Field(None) + + +class Text2VideoTaskCreationRequest(BaseModel): + model: str = Field(...) + content: list[TaskTextContent] = Field(..., min_length=1) + + +class Image2VideoTaskCreationRequest(BaseModel): + model: str = Field(...) + content: list[TaskTextContent | TaskImageContent] = Field(..., min_length=2) + + +class TaskCreationResponse(BaseModel): + id: str = Field(...) + + +class TaskStatusError(BaseModel): + code: str = Field(...) + message: str = Field(...) + + +class TaskStatusResult(BaseModel): + video_url: str = Field(...) + + +class TaskStatusResponse(BaseModel): + id: str = Field(...) + model: str = Field(...) + status: Literal["queued", "running", "cancelled", "succeeded", "failed"] = Field(...) + error: TaskStatusError | None = Field(None) + content: TaskStatusResult | None = Field(None) + + +RECOMMENDED_PRESETS = [ + ("1024x1024 (1:1)", 1024, 1024), + ("864x1152 (3:4)", 864, 1152), + ("1152x864 (4:3)", 1152, 864), + ("1280x720 (16:9)", 1280, 720), + ("720x1280 (9:16)", 720, 1280), + ("832x1248 (2:3)", 832, 1248), + ("1248x832 (3:2)", 1248, 832), + ("1512x648 (21:9)", 1512, 648), + ("2048x2048 (1:1)", 2048, 2048), + ("Custom", None, None), +] + +RECOMMENDED_PRESETS_SEEDREAM_4 = [ + ("2048x2048 (1:1)", 2048, 2048), + ("2304x1728 (4:3)", 2304, 1728), + ("1728x2304 (3:4)", 1728, 2304), + ("2560x1440 (16:9)", 2560, 1440), + ("1440x2560 (9:16)", 1440, 2560), + ("2496x1664 (3:2)", 2496, 1664), + ("1664x2496 (2:3)", 1664, 2496), + ("3024x1296 (21:9)", 3024, 1296), + ("4096x4096 (1:1)", 4096, 4096), + ("Custom", None, None), +] + +# The time in this dictionary are given for 10 seconds duration. +VIDEO_TASKS_EXECUTION_TIME = { + "seedance-1-0-lite-t2v-250428": { + "480p": 40, + "720p": 60, + "1080p": 90, + }, + "seedance-1-0-lite-i2v-250428": { + "480p": 40, + "720p": 60, + "1080p": 90, + }, + "seedance-1-0-pro-250528": { + "480p": 70, + "720p": 85, + "1080p": 115, + }, + "seedance-1-0-pro-fast-251015": { + "480p": 50, + "720p": 65, + "1080p": 100, + }, +} diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index caced471e..57c0218d0 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -1,13 +1,27 @@ import logging import math -from enum import Enum -from typing import Literal, Optional, Union import torch -from pydantic import BaseModel, Field from typing_extensions import override -from comfy_api.latest import IO, ComfyExtension +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api_nodes.apis.bytedance_api import ( + RECOMMENDED_PRESETS, + RECOMMENDED_PRESETS_SEEDREAM_4, + VIDEO_TASKS_EXECUTION_TIME, + Image2ImageTaskCreationRequest, + Image2VideoTaskCreationRequest, + ImageTaskCreationResponse, + Seedream4Options, + Seedream4TaskCreationRequest, + TaskCreationResponse, + TaskImageContent, + TaskImageContentUrl, + TaskStatusResponse, + TaskTextContent, + Text2ImageTaskCreationRequest, + Text2VideoTaskCreationRequest, +) from comfy_api_nodes.util import ( ApiEndpoint, download_url_to_image_tensor, @@ -29,162 +43,6 @@ BYTEPLUS_TASK_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks" BYTEPLUS_TASK_STATUS_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks" # + /{task_id} -class Text2ImageModelName(str, Enum): - seedream_3 = "seedream-3-0-t2i-250415" - - -class Image2ImageModelName(str, Enum): - seededit_3 = "seededit-3-0-i2i-250628" - - -class Text2VideoModelName(str, Enum): - seedance_1_pro = "seedance-1-0-pro-250528" - seedance_1_lite = "seedance-1-0-lite-t2v-250428" - - -class Image2VideoModelName(str, Enum): - """note(August 31): Pro model only supports FirstFrame: https://docs.byteplus.com/en/docs/ModelArk/1520757""" - - seedance_1_pro = "seedance-1-0-pro-250528" - seedance_1_lite = "seedance-1-0-lite-i2v-250428" - - -class Text2ImageTaskCreationRequest(BaseModel): - model: Text2ImageModelName = Text2ImageModelName.seedream_3 - prompt: str = Field(...) - response_format: Optional[str] = Field("url") - size: Optional[str] = Field(None) - seed: Optional[int] = Field(0, ge=0, le=2147483647) - guidance_scale: Optional[float] = Field(..., ge=1.0, le=10.0) - watermark: Optional[bool] = Field(True) - - -class Image2ImageTaskCreationRequest(BaseModel): - model: Image2ImageModelName = Image2ImageModelName.seededit_3 - prompt: str = Field(...) - response_format: Optional[str] = Field("url") - image: str = Field(..., description="Base64 encoded string or image URL") - size: Optional[str] = Field("adaptive") - seed: Optional[int] = Field(..., ge=0, le=2147483647) - guidance_scale: Optional[float] = Field(..., ge=1.0, le=10.0) - watermark: Optional[bool] = Field(True) - - -class Seedream4Options(BaseModel): - max_images: int = Field(15) - - -class Seedream4TaskCreationRequest(BaseModel): - model: str = Field("seedream-4-0-250828") - prompt: str = Field(...) - response_format: str = Field("url") - image: Optional[list[str]] = Field(None, description="Image URLs") - size: str = Field(...) - seed: int = Field(..., ge=0, le=2147483647) - sequential_image_generation: str = Field("disabled") - sequential_image_generation_options: Seedream4Options = Field(Seedream4Options(max_images=15)) - watermark: bool = Field(True) - - -class ImageTaskCreationResponse(BaseModel): - model: str = Field(...) - created: int = Field(..., description="Unix timestamp (in seconds) indicating time when the request was created.") - data: list = Field([], description="Contains information about the generated image(s).") - error: dict = Field({}, description="Contains `code` and `message` fields in case of error.") - - -class TaskTextContent(BaseModel): - type: str = Field("text") - text: str = Field(...) - - -class TaskImageContentUrl(BaseModel): - url: str = Field(...) - - -class TaskImageContent(BaseModel): - type: str = Field("image_url") - image_url: TaskImageContentUrl = Field(...) - role: Optional[Literal["first_frame", "last_frame", "reference_image"]] = Field(None) - - -class Text2VideoTaskCreationRequest(BaseModel): - model: Text2VideoModelName = Text2VideoModelName.seedance_1_pro - content: list[TaskTextContent] = Field(..., min_length=1) - - -class Image2VideoTaskCreationRequest(BaseModel): - model: Image2VideoModelName = Image2VideoModelName.seedance_1_pro - content: list[Union[TaskTextContent, TaskImageContent]] = Field(..., min_length=2) - - -class TaskCreationResponse(BaseModel): - id: str = Field(...) - - -class TaskStatusError(BaseModel): - code: str = Field(...) - message: str = Field(...) - - -class TaskStatusResult(BaseModel): - video_url: str = Field(...) - - -class TaskStatusResponse(BaseModel): - id: str = Field(...) - model: str = Field(...) - status: Literal["queued", "running", "cancelled", "succeeded", "failed"] = Field(...) - error: Optional[TaskStatusError] = Field(None) - content: Optional[TaskStatusResult] = Field(None) - - -RECOMMENDED_PRESETS = [ - ("1024x1024 (1:1)", 1024, 1024), - ("864x1152 (3:4)", 864, 1152), - ("1152x864 (4:3)", 1152, 864), - ("1280x720 (16:9)", 1280, 720), - ("720x1280 (9:16)", 720, 1280), - ("832x1248 (2:3)", 832, 1248), - ("1248x832 (3:2)", 1248, 832), - ("1512x648 (21:9)", 1512, 648), - ("2048x2048 (1:1)", 2048, 2048), - ("Custom", None, None), -] - -RECOMMENDED_PRESETS_SEEDREAM_4 = [ - ("2048x2048 (1:1)", 2048, 2048), - ("2304x1728 (4:3)", 2304, 1728), - ("1728x2304 (3:4)", 1728, 2304), - ("2560x1440 (16:9)", 2560, 1440), - ("1440x2560 (9:16)", 1440, 2560), - ("2496x1664 (3:2)", 2496, 1664), - ("1664x2496 (2:3)", 1664, 2496), - ("3024x1296 (21:9)", 3024, 1296), - ("4096x4096 (1:1)", 4096, 4096), - ("Custom", None, None), -] - -# The time in this dictionary are given for 10 seconds duration. -VIDEO_TASKS_EXECUTION_TIME = { - "seedance-1-0-lite-t2v-250428": { - "480p": 40, - "720p": 60, - "1080p": 90, - }, - "seedance-1-0-lite-i2v-250428": { - "480p": 40, - "720p": 60, - "1080p": 90, - }, - "seedance-1-0-pro-250528": { - "480p": 70, - "720p": 85, - "1080p": 115, - }, -} - - def get_image_url_from_response(response: ImageTaskCreationResponse) -> str: if response.error: error_msg = f"ByteDance request failed. Code: {response.error['code']}, message: {response.error['message']}" @@ -194,13 +52,6 @@ def get_image_url_from_response(response: ImageTaskCreationResponse) -> str: return response.data[0]["url"] -def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]: - """Returns the video URL from the task status response if it exists.""" - if hasattr(response, "content") and response.content: - return response.content.video_url - return None - - class ByteDanceImageNode(IO.ComfyNode): @classmethod @@ -211,12 +62,7 @@ class ByteDanceImageNode(IO.ComfyNode): category="api node/image/ByteDance", description="Generate images using ByteDance models via api based on prompt", inputs=[ - IO.Combo.Input( - "model", - options=Text2ImageModelName, - default=Text2ImageModelName.seedream_3, - tooltip="Model name", - ), + IO.Combo.Input("model", options=["seedream-3-0-t2i-250415"]), IO.String.Input( "prompt", multiline=True, @@ -335,12 +181,7 @@ class ByteDanceImageEditNode(IO.ComfyNode): category="api node/image/ByteDance", description="Edit images using ByteDance models via api based on prompt", inputs=[ - IO.Combo.Input( - "model", - options=Image2ImageModelName, - default=Image2ImageModelName.seededit_3, - tooltip="Model name", - ), + IO.Combo.Input("model", options=["seededit-3-0-i2i-250628"]), IO.Image.Input( "image", tooltip="The base image to edit", @@ -394,7 +235,7 @@ class ByteDanceImageEditNode(IO.ComfyNode): async def execute( cls, model: str, - image: torch.Tensor, + image: Input.Image, prompt: str, seed: int, guidance_scale: float, @@ -434,7 +275,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode): inputs=[ IO.Combo.Input( "model", - options=["seedream-4-0-250828"], + options=["seedream-4-5-251128", "seedream-4-0-250828"], tooltip="Model name", ), IO.String.Input( @@ -459,7 +300,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode): default=2048, min=1024, max=4096, - step=64, + step=8, tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`", optional=True, ), @@ -468,7 +309,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode): default=2048, min=1024, max=4096, - step=64, + step=8, tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`", optional=True, ), @@ -532,7 +373,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode): cls, model: str, prompt: str, - image: torch.Tensor = None, + image: Input.Image | None = None, size_preset: str = RECOMMENDED_PRESETS_SEEDREAM_4[0][0], width: int = 2048, height: int = 2048, @@ -555,6 +396,18 @@ class ByteDanceSeedreamNode(IO.ComfyNode): raise ValueError( f"Custom size out of range: {w}x{h}. " "Both width and height must be between 1024 and 4096 pixels." ) + out_num_pixels = w * h + mp_provided = out_num_pixels / 1_000_000.0 + if "seedream-4-5" in model and out_num_pixels < 3686400: + raise ValueError( + f"Minimum image resolution that Seedream 4.5 can generate is 3.68MP, " + f"but {mp_provided:.2f}MP provided." + ) + if "seedream-4-0" in model and out_num_pixels < 921600: + raise ValueError( + f"Minimum image resolution that the selected model can generate is 0.92MP, " + f"but {mp_provided:.2f}MP provided." + ) n_input_images = get_number_of_images(image) if image is not None else 0 if n_input_images > 10: raise ValueError(f"Maximum of 10 reference images are supported, but {n_input_images} received.") @@ -607,9 +460,8 @@ class ByteDanceTextToVideoNode(IO.ComfyNode): inputs=[ IO.Combo.Input( "model", - options=Text2VideoModelName, - default=Text2VideoModelName.seedance_1_pro, - tooltip="Model name", + options=["seedance-1-0-pro-250528", "seedance-1-0-lite-t2v-250428", "seedance-1-0-pro-fast-251015"], + default="seedance-1-0-pro-fast-251015", ), IO.String.Input( "prompt", @@ -714,9 +566,8 @@ class ByteDanceImageToVideoNode(IO.ComfyNode): inputs=[ IO.Combo.Input( "model", - options=Image2VideoModelName, - default=Image2VideoModelName.seedance_1_pro, - tooltip="Model name", + options=["seedance-1-0-pro-250528", "seedance-1-0-lite-t2v-250428", "seedance-1-0-pro-fast-251015"], + default="seedance-1-0-pro-fast-251015", ), IO.String.Input( "prompt", @@ -787,7 +638,7 @@ class ByteDanceImageToVideoNode(IO.ComfyNode): cls, model: str, prompt: str, - image: torch.Tensor, + image: Input.Image, resolution: str, aspect_ratio: str, duration: int, @@ -833,9 +684,8 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode): inputs=[ IO.Combo.Input( "model", - options=[model.value for model in Image2VideoModelName], - default=Image2VideoModelName.seedance_1_lite.value, - tooltip="Model name", + options=["seedance-1-0-pro-250528", "seedance-1-0-lite-i2v-250428"], + default="seedance-1-0-lite-i2v-250428", ), IO.String.Input( "prompt", @@ -910,8 +760,8 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode): cls, model: str, prompt: str, - first_frame: torch.Tensor, - last_frame: torch.Tensor, + first_frame: Input.Image, + last_frame: Input.Image, resolution: str, aspect_ratio: str, duration: int, @@ -968,9 +818,8 @@ class ByteDanceImageReferenceNode(IO.ComfyNode): inputs=[ IO.Combo.Input( "model", - options=[Image2VideoModelName.seedance_1_lite.value], - default=Image2VideoModelName.seedance_1_lite.value, - tooltip="Model name", + options=["seedance-1-0-pro-250528", "seedance-1-0-lite-i2v-250428"], + default="seedance-1-0-lite-i2v-250428", ), IO.String.Input( "prompt", @@ -1034,7 +883,7 @@ class ByteDanceImageReferenceNode(IO.ComfyNode): cls, model: str, prompt: str, - images: torch.Tensor, + images: Input.Image, resolution: str, aspect_ratio: str, duration: int, @@ -1069,8 +918,8 @@ class ByteDanceImageReferenceNode(IO.ComfyNode): async def process_video_task( cls: type[IO.ComfyNode], - payload: Union[Text2VideoTaskCreationRequest, Image2VideoTaskCreationRequest], - estimated_duration: Optional[int], + payload: Text2VideoTaskCreationRequest | Image2VideoTaskCreationRequest, + estimated_duration: int | None, ) -> IO.NodeOutput: initial_response = await sync_op( cls, @@ -1085,7 +934,7 @@ async def process_video_task( estimated_duration=estimated_duration, response_model=TaskStatusResponse, ) - return IO.NodeOutput(await download_url_to_video_output(get_video_url_from_task_status(response))) + return IO.NodeOutput(await download_url_to_video_output(response.content.video_url)) def raise_if_text_params(prompt: str, text_params: list[str]) -> None: From 8e889c535d1fc407bf27dbf8359eef9580f2ed60 Mon Sep 17 00:00:00 2001 From: dxqb <183307934+dxqb@users.noreply.github.com> Date: Mon, 8 Dec 2025 21:17:26 +0100 Subject: [PATCH 709/759] Support "transformer." LoRA prefix for Z-Image (#11135) --- comfy/lora.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy/lora.py b/comfy/lora.py index e7202ce97..2ed0acb9d 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -320,6 +320,7 @@ def model_lora_keys_unet(model, key_map={}): to = diffusers_keys[k] key_lora = k[:-len(".weight")] key_map["diffusion_model.{}".format(key_lora)] = to + key_map["transformer.{}".format(key_lora)] = to key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = to if isinstance(model, comfy.model_base.Kandinsky5): From 60ee574748209a17ade1c7524e228be2802d1589 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Tue, 9 Dec 2025 06:18:06 +1000 Subject: [PATCH 710/759] retune lowVramPatch VRAM accounting (#11173) In the lowvram case, this now does its math in the model dtype in the post de-quantization domain. Account for that. The patching was also put back on the compute stream getting it off-peak so relax the MATH_FACTOR to only x2 so get out of the worst-case assumption of everything peaking at once. --- comfy/model_patcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 5b1ccb824..8b5edeb52 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -132,14 +132,14 @@ class LowVramPatch: def __call__(self, weight): return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=weight.dtype) -#The above patch logic may cast up the weight to fp32, and do math. Go with fp32 x 3 -LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR = 3 +LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR = 2 def low_vram_patch_estimate_vram(model, key): weight, set_func, convert_func = get_key_weight(model, key) if weight is None: return 0 - return weight.numel() * torch.float32.itemsize * LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR + model_dtype = getattr(model, "manual_cast_dtype", torch.float32) + return weight.numel() * model_dtype.itemsize * LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR def get_key_weight(model, key): set_func = None From 935493f6c186de8808508713a465d6bda75e5ce4 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 9 Dec 2025 04:18:53 +0800 Subject: [PATCH 711/759] chore: update workflow templates to v0.7.54 (#11192) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 12a7c1089..4bd4b21c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.33.10 -comfyui-workflow-templates==0.7.51 +comfyui-workflow-templates==0.7.54 comfyui-embedded-docs==0.3.1 torch torchsde From 3b0368aa34182fc7c97de92d59b609c77138def2 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 8 Dec 2025 14:38:36 -0800 Subject: [PATCH 712/759] Fix regression. (#11194) --- comfy/model_patcher.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 8b5edeb52..a7d24ac13 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -139,6 +139,9 @@ def low_vram_patch_estimate_vram(model, key): if weight is None: return 0 model_dtype = getattr(model, "manual_cast_dtype", torch.float32) + if model_dtype is None: + model_dtype = weight.dtype + return weight.numel() * model_dtype.itemsize * LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR def get_key_weight(model, key): From d50f342c90802830c1178ad9d7f2783dc2821af1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 8 Dec 2025 20:20:04 -0800 Subject: [PATCH 713/759] Fix potential issue. (#11201) --- comfy/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index a7d24ac13..2e8ce2613 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -923,7 +923,7 @@ class ModelPatcher: patch_counter += 1 cast_weight = True - if cast_weight: + if cast_weight and hasattr(m, "comfy_cast_weights"): m.prev_comfy_cast_weights = m.comfy_cast_weights m.comfy_cast_weights = True m.comfy_patched_weights = False From e136b6dbb0b08341388f5bf9a00b1fca29992eb3 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Tue, 9 Dec 2025 14:21:31 +1000 Subject: [PATCH 714/759] dequantization offload accounting (fixes Flux2 OOMs - incl TEs) (#11171) * make setattr safe for non existent attributes Handle the case where the attribute doesnt exist by returning a static sentinel (distinct from None). If the sentinel is passed in as the set value, del the attr. * Account for dequantization and type-casts in offload costs When measuring the cost of offload, identify weights that need a type change or dequantization and add the size of the conversion result to the offload cost. This is mutually exclusive with lowvram patches which already has a large conservative estimate and wont overlap the dequant cost so\ dont double count. * Set the compute type on CLIP MPs So that the loader can know the size of weights for dequant accounting. --- comfy/model_patcher.py | 19 +++++++++++++------ comfy/sd.py | 2 ++ comfy/utils.py | 9 +++++++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 2e8ce2613..a486c2723 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -35,6 +35,7 @@ import comfy.model_management import comfy.patcher_extension import comfy.utils from comfy.comfy_types import UnetWrapperFunction +from comfy.quant_ops import QuantizedTensor from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP @@ -665,12 +666,18 @@ class ModelPatcher: module_mem = comfy.model_management.module_size(m) module_offload_mem = module_mem if hasattr(m, "comfy_cast_weights"): - weight_key = "{}.weight".format(n) - bias_key = "{}.bias".format(n) - if weight_key in self.patches: - module_offload_mem += low_vram_patch_estimate_vram(self.model, weight_key) - if bias_key in self.patches: - module_offload_mem += low_vram_patch_estimate_vram(self.model, bias_key) + def check_module_offload_mem(key): + if key in self.patches: + return low_vram_patch_estimate_vram(self.model, key) + model_dtype = getattr(self.model, "manual_cast_dtype", None) + weight, _, _ = get_key_weight(self.model, key) + if model_dtype is None or weight is None: + return 0 + if (weight.dtype != model_dtype or isinstance(weight, QuantizedTensor)): + return weight.numel() * model_dtype.itemsize + return 0 + module_offload_mem += check_module_offload_mem("{}.weight".format(n)) + module_offload_mem += check_module_offload_mem("{}.bias".format(n)) loading.append((module_offload_mem, module_mem, n, m, params)) return loading diff --git a/comfy/sd.py b/comfy/sd.py index 754b1703d..a16f2d14f 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -127,6 +127,8 @@ class CLIP: self.tokenizer = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) self.patcher = comfy.model_patcher.ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device) + #Match torch.float32 hardcode upcast in TE implemention + self.patcher.set_model_compute_dtype(torch.float32) self.patcher.hook_mode = comfy.hooks.EnumHookMode.MinVram self.patcher.is_clip = True self.apply_hooks_to_conds = None diff --git a/comfy/utils.py b/comfy/utils.py index 89846bc95..9dc0d76ac 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -803,12 +803,17 @@ def safetensors_header(safetensors_path, max_size=100*1024*1024): return None return f.read(length_of_header) +ATTR_UNSET={} + def set_attr(obj, attr, value): attrs = attr.split(".") for name in attrs[:-1]: obj = getattr(obj, name) - prev = getattr(obj, attrs[-1]) - setattr(obj, attrs[-1], value) + prev = getattr(obj, attrs[-1], ATTR_UNSET) + if value is ATTR_UNSET: + delattr(obj, attrs[-1]) + else: + setattr(obj, attrs[-1], value) return prev def set_attr_param(obj, attr, value): From cabc4d351ff620ece87f18019d98131ebcbdf1aa Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Mon, 8 Dec 2025 20:22:02 -0800 Subject: [PATCH 715/759] bump comfyui-frontend-package to 1.33.13 (patch) (#11200) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4bd4b21c3..11a7ac245 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.33.10 +comfyui-frontend-package==1.33.13 comfyui-workflow-templates==0.7.54 comfyui-embedded-docs==0.3.1 torch From b9fb542703085c58f082b4a822329fb6670e8016 Mon Sep 17 00:00:00 2001 From: Lodestone Date: Tue, 9 Dec 2025 11:33:29 +0700 Subject: [PATCH 716/759] add chroma-radiance-x0 mode (#11197) --- comfy/ldm/chroma_radiance/model.py | 20 ++++++++++++++++++-- comfy/model_detection.py | 2 ++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/chroma_radiance/model.py b/comfy/ldm/chroma_radiance/model.py index e643b4414..70d173889 100644 --- a/comfy/ldm/chroma_radiance/model.py +++ b/comfy/ldm/chroma_radiance/model.py @@ -37,7 +37,7 @@ class ChromaRadianceParams(ChromaParams): nerf_final_head_type: str # None means use the same dtype as the model. nerf_embedder_dtype: Optional[torch.dtype] - + use_x0: bool class ChromaRadiance(Chroma): """ @@ -159,6 +159,9 @@ class ChromaRadiance(Chroma): self.skip_dit = [] self.lite = False + if params.use_x0: + self.register_buffer("__x0__", torch.tensor([])) + @property def _nerf_final_layer(self) -> nn.Module: if self.params.nerf_final_head_type == "linear": @@ -276,6 +279,12 @@ class ChromaRadiance(Chroma): params_dict |= overrides return params.__class__(**params_dict) + def _apply_x0_residual(self, predicted, noisy, timesteps): + + # non zero during training to prevent 0 div + eps = 0.0 + return (noisy - predicted) / (timesteps.view(-1,1,1,1) + eps) + def _forward( self, x: Tensor, @@ -316,4 +325,11 @@ class ChromaRadiance(Chroma): transformer_options, attn_mask=kwargs.get("attention_mask", None), ) - return self.forward_nerf(img, img_out, params)[:, :, :h, :w] + + out = self.forward_nerf(img, img_out, params)[:, :, :h, :w] + + # If x0 variant → v-pred, just return this instead + if hasattr(self, "__x0__"): + out = self._apply_x0_residual(out, img, timestep) + return out + diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 74c547427..19e6aa954 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -257,6 +257,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["nerf_tile_size"] = 512 dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear" dit_config["nerf_embedder_dtype"] = torch.float32 + if "__x0__" in state_dict_keys: # x0 pred + dit_config["use_x0"] = True else: dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys From 9d252f3b70c0e89cbb581e28bb1862593c4e5ceb Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Tue, 9 Dec 2025 15:55:13 +1000 Subject: [PATCH 717/759] ops: delete dead code (#11204) This became dead code in https://github.com/comfyanonymous/ComfyUI/pull/11069 --- comfy/ops.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 35237c9f7..6f34d50fc 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -22,7 +22,6 @@ import comfy.model_management from comfy.cli_args import args, PerformanceFeature import comfy.float import comfy.rmsnorm -import contextlib import json def run_every_op(): @@ -94,13 +93,6 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of else: offload_stream = None - if offload_stream is not None: - wf_context = offload_stream - if hasattr(wf_context, "as_context"): - wf_context = wf_context.as_context(offload_stream) - else: - wf_context = contextlib.nullcontext() - non_blocking = comfy.model_management.device_supports_non_blocking(device) weight_has_function = len(s.weight_function) > 0 From e2a800e7ef225260c078ce484c75bb40161d9d94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Tue, 9 Dec 2025 23:59:16 +0200 Subject: [PATCH 718/759] Fix for HunyuanVideo1.5 meanflow distil (#11212) --- comfy/ldm/hunyuan_video/model.py | 3 ++- comfy/model_detection.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py index 2749c53f5..55ab550f8 100644 --- a/comfy/ldm/hunyuan_video/model.py +++ b/comfy/ldm/hunyuan_video/model.py @@ -43,6 +43,7 @@ class HunyuanVideoParams: meanflow: bool use_cond_type_embedding: bool vision_in_dim: int + meanflow_sum: bool class SelfAttentionRef(nn.Module): @@ -317,7 +318,7 @@ class HunyuanVideo(nn.Module): timesteps_r = transformer_options['sample_sigmas'][w[0] + 1] timesteps_r = timesteps_r.unsqueeze(0).to(device=timesteps.device, dtype=timesteps.dtype) vec_r = self.time_r_in(timestep_embedding(timesteps_r, 256, time_factor=1000.0).to(img.dtype)) - vec = (vec + vec_r) / 2 + vec = (vec + vec_r) if self.params.meanflow_sum else (vec + vec_r) / 2 if ref_latent is not None: ref_latent_ids = self.img_ids(ref_latent) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 19e6aa954..1f5d34bdd 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -180,8 +180,10 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["use_cond_type_embedding"] = False if '{}vision_in.proj.0.weight'.format(key_prefix) in state_dict_keys: dit_config["vision_in_dim"] = state_dict['{}vision_in.proj.0.weight'.format(key_prefix)].shape[0] + dit_config["meanflow_sum"] = True else: dit_config["vision_in_dim"] = None + dit_config["meanflow_sum"] = False return dit_config if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): #Flux, Chroma or Chroma Radiance (has no img_in.weight) From 791e30ff5037fa5e7aa4e1396099ea8d6bfb020b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 9 Dec 2025 14:03:21 -0800 Subject: [PATCH 719/759] Fix nan issue when quantizing fp16 tensor. (#11213) --- comfy/quant_ops.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index 571d3f760..cd96541d7 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -399,7 +399,10 @@ class TensorCoreFP8Layout(QuantizedLayout): orig_dtype = tensor.dtype if isinstance(scale, str) and scale == "recalculate": - scale = torch.amax(tensor.abs()) / torch.finfo(dtype).max + scale = torch.amax(tensor.abs()).to(dtype=torch.float32) / torch.finfo(dtype).max + if tensor.dtype not in [torch.float32, torch.bfloat16]: # Prevent scale from being too small + tensor_info = torch.finfo(tensor.dtype) + scale = (1.0 / torch.clamp((1.0 / scale), min=tensor_info.min, max=tensor_info.max)) if scale is not None: if not isinstance(scale, torch.Tensor): From fc657f471a29d07696ca16b566000e8e555d67d1 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 9 Dec 2025 18:22:09 -0500 Subject: [PATCH 720/759] ComfyUI version v0.4.0 From now on ComfyUI will do version numbers a bit differently, every stable off the master branch will increment the minor version. Anytime a fix needs to be backported onto a stable version the patch version will be incremented. Example: We release v0.6.0 off the master branch then a day later a bug is discovered and we decide to backport the fix onto the v0.6.0 stable, this will be done in a separate branch in the main repository and this new stable will be tagged v0.6.1 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 4b039356e..2f083edaf 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.76" +__version__ = "0.4.0" diff --git a/pyproject.toml b/pyproject.toml index 02b94a0ce..e4d3d616a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.76" +version = "0.4.0" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From f668c2e3c99df40561b416cf62b0fd9eec96007a Mon Sep 17 00:00:00 2001 From: Benjamin Lu Date: Tue, 9 Dec 2025 19:27:07 -0800 Subject: [PATCH 721/759] bump comfyui-frontend-package to 1.34.8 (#11220) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 11a7ac245..9e9b25328 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.33.13 +comfyui-frontend-package==1.34.8 comfyui-workflow-templates==0.7.54 comfyui-embedded-docs==0.3.1 torch From 36357bbcc3c515e37a742457a2b2ab4b7ccc17a8 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 10 Dec 2025 21:55:09 +0200 Subject: [PATCH 722/759] process the NodeV1 dict results correctly (#11237) --- comfy_api/latest/_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 313a5af20..79217c813 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -1815,7 +1815,7 @@ class NodeOutput(_NodeOutputInternal): ui = data["ui"] if "expand" in data: expand = data["expand"] - return cls(args=args, ui=ui, expand=expand) + return cls(*args, ui=ui, expand=expand) def __getitem__(self, index) -> Any: return self.args[index] From 17c92a9f2843d7b9b727531066be2378b350a6ae Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 10 Dec 2025 16:59:48 -0800 Subject: [PATCH 723/759] Tweak Z Image memory estimation. (#11254) --- comfy/supported_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 383c82c3e..dd0f09f32 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1026,7 +1026,7 @@ class ZImage(Lumina2): "shift": 3.0, } - memory_usage_factor = 1.7 + memory_usage_factor = 2.0 supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] From 57ddb7fd13d817e7259c2c992a852832b6b0f07a Mon Sep 17 00:00:00 2001 From: Johnpaul Chiwetelu <49923152+Myestery@users.noreply.github.com> Date: Thu, 11 Dec 2025 03:49:49 +0100 Subject: [PATCH 724/759] Fix: filter hidden files from /internal/files endpoint (#11191) --- api_server/routes/internal/internal_routes.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/api_server/routes/internal/internal_routes.py b/api_server/routes/internal/internal_routes.py index 613b0f7c7..b224306da 100644 --- a/api_server/routes/internal/internal_routes.py +++ b/api_server/routes/internal/internal_routes.py @@ -58,8 +58,13 @@ class InternalRoutes: return web.json_response({"error": "Invalid directory type"}, status=400) directory = get_directory_by_type(directory_type) + + def is_visible_file(entry: os.DirEntry) -> bool: + """Filter out hidden files (e.g., .DS_Store on macOS).""" + return entry.is_file() and not entry.name.startswith('.') + sorted_files = sorted( - (entry for entry in os.scandir(directory) if entry.is_file()), + (entry for entry in os.scandir(directory) if is_visible_file(entry)), key=lambda entry: -entry.stat().st_mtime ) return web.json_response([entry.name for entry in sorted_files], status=200) From e711aaf1a75120195c56ebd1f1ce829c6b7b84db Mon Sep 17 00:00:00 2001 From: Farshore <168402472+jiangchengchengark@users.noreply.github.com> Date: Thu, 11 Dec 2025 11:02:26 +0800 Subject: [PATCH 725/759] =?UTF-8?q?Lower=20VAE=20loading=20requirements?= =?UTF-8?q?=EF=BC=9ACreate=20a=20new=20branch=20for=20GPU=20memory=20calcu?= =?UTF-8?q?lations=20in=20qwen-image=20vae=20(#11199)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- comfy/sd.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index a16f2d14f..1cad98aef 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -549,8 +549,10 @@ class VAE: ddconfig = {"dim": dim, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0} self.first_stage_model = comfy.ldm.wan.vae.WanVAE(**ddconfig) self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] - self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype) - self.memory_used_decode = lambda shape, dtype: 7000 * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype) + self.memory_used_encode = lambda shape, dtype: (1500 if shape[2]<=4 else 6000) * shape[3] * shape[4] * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (2200 if shape[2]<=4 else 7000) * shape[3] * shape[4] * (8*8) * model_management.dtype_size(dtype) + + # Hunyuan 3d v2 2.0 & 2.1 elif "geo_decoder.cross_attn_decoder.ln_1.bias" in sd: From 93948e3fc598c14082f744fe82fae056b64ff481 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 11 Dec 2025 08:11:12 +0200 Subject: [PATCH 726/759] feat(api-nodes): enable Kling Omni O1 node (#11229) --- comfy_api_nodes/nodes_kling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 6c840dc47..a2cc87d84 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -2056,7 +2056,7 @@ class KlingExtension(ComfyExtension): OmniProImageToVideoNode, OmniProVideoToVideoNode, OmniProEditVideoNode, - # OmniProImageNode, # need support from backend + OmniProImageNode, ] From f8321eb57b29a4b34cecd27d5d6365adf5e6e601 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 10 Dec 2025 22:30:31 -0800 Subject: [PATCH 727/759] Adjust memory usage factor. (#11257) --- comfy/supported_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index dd0f09f32..ef8c75c09 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -541,7 +541,7 @@ class SD3(supported_models_base.BASE): unet_extra_config = {} latent_format = latent_formats.SD3 - memory_usage_factor = 1.2 + memory_usage_factor = 1.6 text_encoder_key_prefix = ["text_encoders."] From fdebe182966d1dd9bee3138264937137bd2302d8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 11 Dec 2025 14:09:35 -0800 Subject: [PATCH 728/759] Fix regular chroma radiance (#11276) --- comfy/model_detection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 1f5d34bdd..94b54b7c2 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -261,6 +261,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["nerf_embedder_dtype"] = torch.float32 if "__x0__" in state_dict_keys: # x0 pred dit_config["use_x0"] = True + else: + dit_config["use_x0"] = False else: dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys From ae65433a602470eea271df47af0eb871d146a002 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 11 Dec 2025 14:15:00 -0800 Subject: [PATCH 729/759] This only works on radiance. (#11277) --- comfy/model_detection.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 94b54b7c2..dd6a703f6 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -259,10 +259,10 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["nerf_tile_size"] = 512 dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear" dit_config["nerf_embedder_dtype"] = torch.float32 - if "__x0__" in state_dict_keys: # x0 pred - dit_config["use_x0"] = True - else: - dit_config["use_x0"] = False + if "__x0__" in state_dict_keys: # x0 pred + dit_config["use_x0"] = True + else: + dit_config["use_x0"] = False else: dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys From eeb020b9b77e1f3c0c2806bc1e38c7ba9576439e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 11 Dec 2025 14:33:09 -0800 Subject: [PATCH 730/759] Better chroma radiance and other models vram estimation. (#11278) --- comfy/supported_models.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index ef8c75c09..834dfcffc 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -965,7 +965,7 @@ class CosmosT2IPredict2(supported_models_base.BASE): def __init__(self, unet_config): super().__init__(unet_config) - self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.9 + self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.95 def get_model(self, state_dict, prefix="", device=None): out = model_base.CosmosPredict2(self, device=device) @@ -1289,7 +1289,7 @@ class ChromaRadiance(Chroma): latent_format = comfy.latent_formats.ChromaRadiance # Pixel-space model, no spatial compression for model input. - memory_usage_factor = 0.038 + memory_usage_factor = 0.044 def get_model(self, state_dict, prefix="", device=None): return model_base.ChromaRadiance(self, device=device) @@ -1332,7 +1332,7 @@ class Omnigen2(supported_models_base.BASE): "shift": 2.6, } - memory_usage_factor = 1.65 #TODO + memory_usage_factor = 1.95 #TODO unet_extra_config = {} latent_format = latent_formats.Flux @@ -1397,7 +1397,7 @@ class HunyuanImage21(HunyuanVideo): latent_format = latent_formats.HunyuanImage21 - memory_usage_factor = 7.7 + memory_usage_factor = 8.7 supported_inference_dtypes = [torch.bfloat16, torch.float32] @@ -1488,7 +1488,7 @@ class Kandinsky5(supported_models_base.BASE): unet_extra_config = {} latent_format = latent_formats.HunyuanVideo - memory_usage_factor = 1.1 #TODO + memory_usage_factor = 1.25 #TODO supported_inference_dtypes = [torch.bfloat16, torch.float32] @@ -1517,7 +1517,7 @@ class Kandinsky5Image(Kandinsky5): } latent_format = latent_formats.Flux - memory_usage_factor = 1.1 #TODO + memory_usage_factor = 1.25 #TODO def get_model(self, state_dict, prefix="", device=None): out = model_base.Kandinsky5Image(self, device=device) From 338d9ae3bbf24a9a06996cdf1c2f228acc65fd96 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 11 Dec 2025 15:56:33 -0800 Subject: [PATCH 731/759] Make portable updater work with repos in unmerged state. (#11281) --- .ci/update_windows/update.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.ci/update_windows/update.py b/.ci/update_windows/update.py index 59ece5130..fe646a6ed 100755 --- a/.ci/update_windows/update.py +++ b/.ci/update_windows/update.py @@ -53,6 +53,16 @@ try: repo.stash(ident) except KeyError: print("nothing to stash") # noqa: T201 +except: + print("Could not stash, cleaning index and trying again.") # noqa: T201 + repo.state_cleanup() + repo.index.read_tree(repo.head.peel().tree) + repo.index.write() + try: + repo.stash(ident) + except KeyError: + print("nothing to stash.") # noqa: T201 + backup_branch_name = 'backup_branch_{}'.format(datetime.today().strftime('%Y-%m-%d_%H_%M_%S')) print("creating backup branch: {}".format(backup_branch_name)) # noqa: T201 try: From 982876d59a659adb085be5e236aacc4f2c54c19c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Fri, 12 Dec 2025 05:29:34 +0200 Subject: [PATCH 732/759] WanMove support (#11247) --- comfy_api/latest/_io.py | 8 + comfy_extras/nodes_wanmove.py | 535 ++++++++++++++++++++++++++++++++++ nodes.py | 1 + 3 files changed, 544 insertions(+) create mode 100644 comfy_extras/nodes_wanmove.py diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 79217c813..2b634d172 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -774,6 +774,13 @@ class AudioEncoder(ComfyTypeIO): class AudioEncoderOutput(ComfyTypeIO): Type = Any +@comfytype(io_type="TRACKS") +class Tracks(ComfyTypeIO): + class TrackDict(TypedDict): + track_path: torch.Tensor + track_visibility: torch.Tensor + Type = TrackDict + @comfytype(io_type="COMFY_MULTITYPED_V3") class MultiType: Type = Any @@ -1894,6 +1901,7 @@ __all__ = [ "SEGS", "AnyType", "MultiType", + "Tracks", # Dynamic Types "MatchType", # "DynamicCombo", diff --git a/comfy_extras/nodes_wanmove.py b/comfy_extras/nodes_wanmove.py new file mode 100644 index 000000000..5f39afa46 --- /dev/null +++ b/comfy_extras/nodes_wanmove.py @@ -0,0 +1,535 @@ +import nodes +import node_helpers +import torch +import torchvision.transforms.functional as TF +import comfy.model_management +import comfy.utils +import numpy as np +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io +from comfy_extras.nodes_wan import parse_json_tracks + +# https://github.com/ali-vilab/Wan-Move/blob/main/wan/modules/trajectory.py +from PIL import Image, ImageDraw + +SKIP_ZERO = False + +def get_pos_emb( + pos_k: torch.Tensor, # A 1D tensor containing positions for which to generate embeddings. + pos_emb_dim: int, + theta_func: callable = lambda i, d: torch.pow(10000, torch.mul(2, torch.div(i.to(torch.float32), d))), #Function to compute thetas based on position and embedding dimensions. + device: torch.device = torch.device("cpu"), + dtype: torch.dtype = torch.float32, +) -> torch.Tensor: # The position embeddings (batch_size, pos_emb_dim) + + assert pos_emb_dim % 2 == 0, "The dimension of position embeddings must be even." + pos_k = pos_k.to(device, dtype) + if SKIP_ZERO: + pos_k = pos_k + 1 + batch_size = pos_k.size(0) + + denominator = torch.arange(0, pos_emb_dim // 2, device=device, dtype=dtype) + # Expand denominator to match the shape needed for broadcasting + denominator_expanded = denominator.view(1, -1).expand(batch_size, -1) + + thetas = theta_func(denominator_expanded, pos_emb_dim) + + # Ensure pos_k is in the correct shape for broadcasting + pos_k_expanded = pos_k.view(-1, 1).to(dtype) + sin_thetas = torch.sin(torch.div(pos_k_expanded, thetas)) + cos_thetas = torch.cos(torch.div(pos_k_expanded, thetas)) + + # Concatenate sine and cosine embeddings along the last dimension + pos_emb = torch.cat([sin_thetas, cos_thetas], dim=-1) + + return pos_emb + +def create_pos_embeddings( + pred_tracks: torch.Tensor, # the predicted tracks, [T, N, 2] + pred_visibility: torch.Tensor, # the predicted visibility [T, N] + downsample_ratios: list[int], # the ratios for downsampling time, height, and width + height: int, # the height of the feature map + width: int, # the width of the feature map + track_num: int = -1, # the number of tracks to use + t_down_strategy: str = "sample", # the strategy for downsampling time dimension +): + assert t_down_strategy in ["sample", "average"], "Invalid strategy for downsampling time dimension." + + t, n, _ = pred_tracks.shape + t_down, h_down, w_down = downsample_ratios + track_pos = - torch.ones(n, (t-1) // t_down + 1, 2, dtype=torch.long) + + if track_num == -1: + track_num = n + + tracks_idx = torch.randperm(n)[:track_num] + tracks = pred_tracks[:, tracks_idx] + visibility = pred_visibility[:, tracks_idx] + + for t_idx in range(0, t, t_down): + if t_down_strategy == "sample" or t_idx == 0: + cur_tracks = tracks[t_idx] # [N, 2] + cur_visibility = visibility[t_idx] # [N] + else: + cur_tracks = tracks[t_idx:t_idx+t_down].mean(dim=0) + cur_visibility = torch.any(visibility[t_idx:t_idx+t_down], dim=0) + + for i in range(track_num): + if not cur_visibility[i] or cur_tracks[i][0] < 0 or cur_tracks[i][1] < 0 or cur_tracks[i][0] >= width or cur_tracks[i][1] >= height: + continue + x, y = cur_tracks[i] + x, y = int(x // w_down), int(y // h_down) + track_pos[i, t_idx // t_down, 0], track_pos[i, t_idx // t_down, 1] = y, x + + return track_pos # the position embeddings, [N, T', 2], 2 = height, width + +def replace_feature( + vae_feature: torch.Tensor, # [B, C', T', H', W'] + track_pos: torch.Tensor, # [B, N, T', 2] + strength: float = 1.0 +) -> torch.Tensor: + b, _, t, h, w = vae_feature.shape + assert b == track_pos.shape[0], "Batch size mismatch." + n = track_pos.shape[1] + + # Shuffle the trajectory order + track_pos = track_pos[:, torch.randperm(n), :, :] + + # Extract coordinates at time steps ≥ 1 and generate a valid mask + current_pos = track_pos[:, :, 1:, :] # [B, N, T-1, 2] + mask = (current_pos[..., 0] >= 0) & (current_pos[..., 1] >= 0) # [B, N, T-1] + + # Get all valid indices + valid_indices = mask.nonzero(as_tuple=False) # [num_valid, 3] + num_valid = valid_indices.shape[0] + + if num_valid == 0: + return vae_feature + + # Decompose valid indices into each dimension + batch_idx = valid_indices[:, 0] + track_idx = valid_indices[:, 1] + t_rel = valid_indices[:, 2] + t_target = t_rel + 1 # Convert to original time step indices + + # Extract target position coordinates + h_target = current_pos[batch_idx, track_idx, t_rel, 0].long() # Ensure integer indices + w_target = current_pos[batch_idx, track_idx, t_rel, 1].long() + + # Extract source position coordinates (t=0) + h_source = track_pos[batch_idx, track_idx, 0, 0].long() + w_source = track_pos[batch_idx, track_idx, 0, 1].long() + + # Get source features and assign to target positions + src_features = vae_feature[batch_idx, :, 0, h_source, w_source] + dst_features = vae_feature[batch_idx, :, t_target, h_target, w_target] + + vae_feature[batch_idx, :, t_target, h_target, w_target] = dst_features + (src_features - dst_features) * strength + + + return vae_feature + +# Visualize functions + +def _draw_gradient_polyline_on_overlay(overlay, line_width, points, start_color, opacity=1.0): + draw = ImageDraw.Draw(overlay, 'RGBA') + points = points[::-1] + + # Compute total length + total_length = 0 + segment_lengths = [] + for i in range(len(points) - 1): + dx = points[i + 1][0] - points[i][0] + dy = points[i + 1][1] - points[i][1] + length = (dx * dx + dy * dy) ** 0.5 + segment_lengths.append(length) + total_length += length + + if total_length == 0: + return + + accumulated_length = 0 + + # Draw the gradient polyline + for idx, (start_point, end_point) in enumerate(zip(points[:-1], points[1:])): + segment_length = segment_lengths[idx] + steps = max(int(segment_length), 1) + + for i in range(steps): + current_length = accumulated_length + (i / steps) * segment_length + ratio = current_length / total_length + + alpha = int(255 * (1 - ratio) * opacity) + color = (*start_color, alpha) + + x = int(start_point[0] + (end_point[0] - start_point[0]) * i / steps) + y = int(start_point[1] + (end_point[1] - start_point[1]) * i / steps) + + dynamic_line_width = max(int(line_width * (1 - ratio)), 1) + draw.line([(x, y), (x + 1, y)], fill=color, width=dynamic_line_width) + + accumulated_length += segment_length + + +def add_weighted(rgb, track): + rgb = np.array(rgb) # [H, W, C] "RGB" + track = np.array(track) # [H, W, C] "RGBA" + + alpha = track[:, :, 3] / 255.0 + alpha = np.stack([alpha] * 3, axis=-1) + blend_img = track[:, :, :3] * alpha + rgb * (1 - alpha) + + return Image.fromarray(blend_img.astype(np.uint8)) + +def draw_tracks_on_video(video, tracks, visibility=None, track_frame=24, circle_size=12, opacity=0.5, line_width=16): + color_map = [(102, 153, 255), (0, 255, 255), (255, 255, 0), (255, 102, 204), (0, 255, 0)] + + video = video.byte().cpu().numpy() # (81, 480, 832, 3) + tracks = tracks[0].long().detach().cpu().numpy() + if visibility is not None: + visibility = visibility[0].detach().cpu().numpy() + + num_frames, height, width = video.shape[:3] + num_tracks = tracks.shape[1] + alpha_opacity = int(255 * opacity) + + output_frames = [] + for t in range(num_frames): + frame_rgb = video[t].astype(np.float32) + + # Create a single RGBA overlay for all tracks in this frame + overlay = Image.new("RGBA", (width, height), (0, 0, 0, 0)) + draw_overlay = ImageDraw.Draw(overlay) + + polyline_data = [] + + # Draw all circles on a single overlay + for n in range(num_tracks): + if visibility is not None and visibility[t, n] == 0: + continue + + track_coord = tracks[t, n] + color = color_map[n % len(color_map)] + circle_color = color + (alpha_opacity,) + + draw_overlay.ellipse((track_coord[0] - circle_size, track_coord[1] - circle_size, track_coord[0] + circle_size, track_coord[1] + circle_size), + fill=circle_color + ) + + # Store polyline data for batch processing + tracks_coord = tracks[max(t - track_frame, 0):t + 1, n] + if len(tracks_coord) > 1: + polyline_data.append((tracks_coord, color)) + + # Blend circles overlay once + overlay_np = np.array(overlay) + alpha = overlay_np[:, :, 3:4] / 255.0 + frame_rgb = overlay_np[:, :, :3] * alpha + frame_rgb * (1 - alpha) + + # Draw all polylines on a single overlay + if polyline_data: + polyline_overlay = Image.new("RGBA", (width, height), (0, 0, 0, 0)) + for tracks_coord, color in polyline_data: + _draw_gradient_polyline_on_overlay(polyline_overlay, line_width, tracks_coord, color, opacity) + + # Blend polylines overlay once + polyline_np = np.array(polyline_overlay) + alpha = polyline_np[:, :, 3:4] / 255.0 + frame_rgb = polyline_np[:, :, :3] * alpha + frame_rgb * (1 - alpha) + + output_frames.append(Image.fromarray(frame_rgb.astype(np.uint8))) + + return output_frames + + +class WanMoveVisualizeTracks(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanMoveVisualizeTracks", + category="conditioning/video_models", + inputs=[ + io.Image.Input("images"), + io.Tracks.Input("tracks", optional=True), + io.Int.Input("line_resolution", default=24, min=1, max=1024), + io.Int.Input("circle_size", default=12, min=1, max=128), + io.Float.Input("opacity", default=0.75, min=0.0, max=1.0, step=0.01), + io.Int.Input("line_width", default=16, min=1, max=128), + ], + outputs=[ + io.Image.Output(), + ], + ) + + @classmethod + def execute(cls, images, line_resolution, circle_size, opacity, line_width, tracks=None) -> io.NodeOutput: + if tracks is None: + return io.NodeOutput(images) + + track_path = tracks["track_path"].unsqueeze(0) + track_visibility = tracks["track_visibility"].unsqueeze(0) + images_in = images * 255.0 + if images_in.shape[0] != track_path.shape[1]: + repeat_count = track_path.shape[1] // images.shape[0] + images_in = images_in.repeat(repeat_count, 1, 1, 1) + track_video = draw_tracks_on_video(images_in, track_path, track_visibility, track_frame=line_resolution, circle_size=circle_size, opacity=opacity, line_width=line_width) + track_video = torch.stack([TF.to_tensor(frame) for frame in track_video], dim=0).movedim(1, -1).float() + + return io.NodeOutput(track_video.to(comfy.model_management.intermediate_device())) + + +class WanMoveTracksFromCoords(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanMoveTracksFromCoords", + category="conditioning/video_models", + inputs=[ + io.String.Input("track_coords", force_input=True, default="[]", optional=True), + io.Mask.Input("track_mask", optional=True), + ], + outputs=[ + io.Tracks.Output(), + io.Int.Output(display_name="track_length"), + ], + ) + + @classmethod + def execute(cls, track_coords, track_mask=None) -> io.NodeOutput: + device=comfy.model_management.intermediate_device() + + tracks_data = parse_json_tracks(track_coords) + track_length = len(tracks_data[0]) + + track_list = [ + [[track[frame]['x'], track[frame]['y']] for track in tracks_data] + for frame in range(len(tracks_data[0])) + ] + tracks = torch.tensor(track_list, dtype=torch.float32, device=device) # [frames, num_tracks, 2] + + num_tracks = tracks.shape[-2] + if track_mask is None: + track_visibility = torch.ones((track_length, num_tracks), dtype=torch.bool, device=device) + else: + track_visibility = (track_mask > 0).any(dim=(1, 2)).unsqueeze(-1) + + out_track_info = {} + out_track_info["track_path"] = tracks + out_track_info["track_visibility"] = track_visibility + return io.NodeOutput(out_track_info, track_length) + + +class GenerateTracks(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="GenerateTracks", + category="conditioning/video_models", + inputs=[ + io.Int.Input("width", default=832, min=16, max=4096, step=16), + io.Int.Input("height", default=480, min=16, max=4096, step=16), + io.Float.Input("start_x", default=0.0, min=0.0, max=1.0, step=0.01, tooltip="Normalized X coordinate (0-1) for start position."), + io.Float.Input("start_y", default=0.0, min=0.0, max=1.0, step=0.01, tooltip="Normalized Y coordinate (0-1) for start position."), + io.Float.Input("end_x", default=1.0, min=0.0, max=1.0, step=0.01, tooltip="Normalized X coordinate (0-1) for end position."), + io.Float.Input("end_y", default=1.0, min=0.0, max=1.0, step=0.01, tooltip="Normalized Y coordinate (0-1) for end position."), + io.Int.Input("num_frames", default=81, min=1, max=1024), + io.Int.Input("num_tracks", default=5, min=1, max=100), + io.Float.Input("track_spread", default=0.025, min=0.0, max=1.0, step=0.001, tooltip="Normalized distance between tracks. Tracks are spread perpendicular to the motion direction."), + io.Boolean.Input("bezier", default=False, tooltip="Enable Bezier curve path using the mid point as control point."), + io.Float.Input("mid_x", default=0.5, min=0.0, max=1.0, step=0.01, tooltip="Normalized X control point for Bezier curve. Only used when 'bezier' is enabled."), + io.Float.Input("mid_y", default=0.5, min=0.0, max=1.0, step=0.01, tooltip="Normalized Y control point for Bezier curve. Only used when 'bezier' is enabled."), + io.Combo.Input( + "interpolation", + options=["linear", "ease_in", "ease_out", "ease_in_out", "constant"], + tooltip="Controls the timing/speed of movement along the path.", + ), + io.Mask.Input("track_mask", optional=True, tooltip="Optional mask to indicate visible frames."), + ], + outputs=[ + io.Tracks.Output(), + io.Int.Output(display_name="track_length"), + ], + ) + + @classmethod + def execute(cls, width, height, start_x, start_y, mid_x, mid_y, end_x, end_y, num_frames, num_tracks, + track_spread, bezier=False, interpolation="linear", track_mask=None) -> io.NodeOutput: + device = comfy.model_management.intermediate_device() + track_length = num_frames + + # normalized coordinates to pixel coordinates + start_x_px = start_x * width + start_y_px = start_y * height + mid_x_px = mid_x * width + mid_y_px = mid_y * height + end_x_px = end_x * width + end_y_px = end_y * height + + track_spread_px = track_spread * (width + height) / 2 # Use average of width/height for spread to keep it proportional + + t = torch.linspace(0, 1, num_frames, device=device) + if interpolation == "constant": # All points stay at start position + interp_values = torch.zeros_like(t) + elif interpolation == "linear": + interp_values = t + elif interpolation == "ease_in": + interp_values = t ** 2 + elif interpolation == "ease_out": + interp_values = 1 - (1 - t) ** 2 + elif interpolation == "ease_in_out": + interp_values = t * t * (3 - 2 * t) + + if bezier: # apply interpolation to t for timing control along the bezier path + t_interp = interp_values + one_minus_t = 1 - t_interp + x_positions = one_minus_t ** 2 * start_x_px + 2 * one_minus_t * t_interp * mid_x_px + t_interp ** 2 * end_x_px + y_positions = one_minus_t ** 2 * start_y_px + 2 * one_minus_t * t_interp * mid_y_px + t_interp ** 2 * end_y_px + tangent_x = 2 * one_minus_t * (mid_x_px - start_x_px) + 2 * t_interp * (end_x_px - mid_x_px) + tangent_y = 2 * one_minus_t * (mid_y_px - start_y_px) + 2 * t_interp * (end_y_px - mid_y_px) + else: # calculate base x and y positions for each frame (center track) + x_positions = start_x_px + (end_x_px - start_x_px) * interp_values + y_positions = start_y_px + (end_y_px - start_y_px) * interp_values + # For non-bezier, tangent is constant (direction from start to end) + tangent_x = torch.full_like(t, end_x_px - start_x_px) + tangent_y = torch.full_like(t, end_y_px - start_y_px) + + track_list = [] + for frame_idx in range(num_frames): + # Calculate perpendicular direction at this frame + tx = tangent_x[frame_idx].item() + ty = tangent_y[frame_idx].item() + length = (tx ** 2 + ty ** 2) ** 0.5 + + if length > 0: # Perpendicular unit vector (rotate 90 degrees) + perp_x = -ty / length + perp_y = tx / length + else: # If tangent is zero, spread horizontally + perp_x = 1.0 + perp_y = 0.0 + + frame_tracks = [] + for track_idx in range(num_tracks): # center tracks around the main path offset ranges from -(num_tracks-1)/2 to +(num_tracks-1)/2 + offset = (track_idx - (num_tracks - 1) / 2) * track_spread_px + track_x = x_positions[frame_idx].item() + perp_x * offset + track_y = y_positions[frame_idx].item() + perp_y * offset + frame_tracks.append([track_x, track_y]) + track_list.append(frame_tracks) + + tracks = torch.tensor(track_list, dtype=torch.float32, device=device) # [frames, num_tracks, 2] + + if track_mask is None: + track_visibility = torch.ones((track_length, num_tracks), dtype=torch.bool, device=device) + else: + track_visibility = (track_mask > 0).any(dim=(1, 2)).unsqueeze(-1) + + out_track_info = {} + out_track_info["track_path"] = tracks + out_track_info["track_visibility"] = track_visibility + return io.NodeOutput(out_track_info, track_length) + + +class WanMoveConcatTrack(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanMoveConcatTrack", + category="conditioning/video_models", + inputs=[ + io.Tracks.Input("tracks_1"), + io.Tracks.Input("tracks_2", optional=True), + ], + outputs=[ + io.Tracks.Output(), + ], + ) + + @classmethod + def execute(cls, tracks_1=None, tracks_2=None) -> io.NodeOutput: + if tracks_2 is None: + return io.NodeOutput(tracks_1) + + tracks_out = torch.cat([tracks_1["track_path"], tracks_2["track_path"]], dim=1) # Concatenate along the track dimension + mask_out = torch.cat([tracks_1["track_visibility"], tracks_2["track_visibility"]], dim=-1) + + out_track_info = {} + out_track_info["track_path"] = tracks_out + out_track_info["track_visibility"] = mask_out + return io.NodeOutput(out_track_info) + + +class WanMoveTrackToVideo(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanMoveTrackToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Tracks.Input("tracks", optional=True), + io.Float.Input("strength", default=1.0, min=0.0, max=100.0, step=0.01, tooltip="Strength of the track conditioning."), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Image.Input("start_image"), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) + + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, strength, tracks=None, start_image=None, clip_vision_output=None) -> io.NodeOutput: + device=comfy.model_management.intermediate_device() + latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=device) + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + image = torch.ones((length, height, width, start_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) * 0.5 + image[:start_image.shape[0]] = start_image + + concat_latent_image = vae.encode(image[:, :, :, :3]) + mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) + mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0 + + if tracks is not None and strength > 0.0: + tracks_path = tracks["track_path"][:length] # [T, N, 2] + num_tracks = tracks_path.shape[-2] + + track_visibility = tracks.get("track_visibility", torch.ones((length, num_tracks), dtype=torch.bool, device=device)) + + track_pos = create_pos_embeddings(tracks_path, track_visibility, [4, 8, 8], height, width, track_num=num_tracks) + track_pos = comfy.utils.resize_to_batch_size(track_pos.unsqueeze(0), batch_size) + concat_latent_image_pos = replace_feature(concat_latent_image, track_pos, strength) + else: + concat_latent_image_pos = concat_latent_image + + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image_pos, "concat_mask": mask}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) + + if clip_vision_output is not None: + positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output}) + negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output}) + + out_latent = {} + out_latent["samples"] = latent + return io.NodeOutput(positive, negative, out_latent) + + +class WanMoveExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + WanMoveTrackToVideo, + WanMoveTracksFromCoords, + WanMoveConcatTrack, + WanMoveVisualizeTracks, + GenerateTracks, + ] + +async def comfy_entrypoint() -> WanMoveExtension: + return WanMoveExtension() diff --git a/nodes.py b/nodes.py index 8d28a725d..8678f510a 100644 --- a/nodes.py +++ b/nodes.py @@ -2358,6 +2358,7 @@ async def init_builtin_extra_nodes(): "nodes_logic.py", "nodes_nop.py", "nodes_kandinsky5.py", + "nodes_wanmove.py", ] import_failed = [] From 5495589db38409353a85b06df7d10f8de2f9c78d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 11 Dec 2025 20:32:27 -0800 Subject: [PATCH 733/759] Respect the dtype the op was initialized in for non quant mixed op. (#11282) --- comfy/ops.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 6f34d50fc..6ae6e791a 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -497,8 +497,10 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec ) -> None: super().__init__() - self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype} - # self.factory_kwargs = {"device": device, "dtype": dtype} + if dtype is None: + dtype = MixedPrecisionOps._compute_dtype + + self.factory_kwargs = {"device": device, "dtype": dtype} self.in_features = in_features self.out_features = out_features @@ -530,7 +532,10 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec layer_conf = json.loads(layer_conf.numpy().tobytes()) if layer_conf is None: - self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False) + dtype = self.factory_kwargs["dtype"] + self.weight = torch.nn.Parameter(weight.to(device=device, dtype=dtype), requires_grad=False) + if dtype != MixedPrecisionOps._compute_dtype: + self.comfy_cast_weights = True else: self.quant_format = layer_conf.get("format", None) if not self._full_precision_mm: From 908fd7d7496f6de88722263e1e00fcd3d22e584f Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 12 Dec 2025 10:18:31 +0200 Subject: [PATCH 734/759] feat(api-nodes): new TextToVideoWithAudio and ImageToVideoWithAudio nodes (#11267) --- comfy_api_nodes/apis/kling_api.py | 28 ++++- comfy_api_nodes/nodes_kling.py | 169 ++++++++++++++++++++++++++---- 2 files changed, 174 insertions(+), 23 deletions(-) diff --git a/comfy_api_nodes/apis/kling_api.py b/comfy_api_nodes/apis/kling_api.py index d8949f8ac..80a758466 100644 --- a/comfy_api_nodes/apis/kling_api.py +++ b/comfy_api_nodes/apis/kling_api.py @@ -51,25 +51,25 @@ class TaskStatusImageResult(BaseModel): url: str = Field(..., description="URL for generated image") -class OmniTaskStatusResults(BaseModel): +class TaskStatusResults(BaseModel): videos: list[TaskStatusVideoResult] | None = Field(None) images: list[TaskStatusImageResult] | None = Field(None) -class OmniTaskStatusResponseData(BaseModel): +class TaskStatusResponseData(BaseModel): created_at: int | None = Field(None, description="Task creation time") updated_at: int | None = Field(None, description="Task update time") task_status: str | None = None task_status_msg: str | None = Field(None, description="Additional failure reason. Only for polling endpoint.") task_id: str | None = Field(None, description="Task ID") - task_result: OmniTaskStatusResults | None = Field(None) + task_result: TaskStatusResults | None = Field(None) -class OmniTaskStatusResponse(BaseModel): +class TaskStatusResponse(BaseModel): code: int | None = Field(None, description="Error code") message: str | None = Field(None, description="Error message") request_id: str | None = Field(None, description="Request ID") - data: OmniTaskStatusResponseData | None = Field(None) + data: TaskStatusResponseData | None = Field(None) class OmniImageParamImage(BaseModel): @@ -84,3 +84,21 @@ class OmniProImageRequest(BaseModel): mode: str = Field("pro") n: int | None = Field(1, le=9) image_list: list[OmniImageParamImage] | None = Field(..., max_length=10) + + +class TextToVideoWithAudioRequest(BaseModel): + model_name: str = Field(..., description="kling-v2-6") + aspect_ratio: str = Field(..., description="'16:9', '9:16' or '1:1'") + duration: str = Field(..., description="'5' or '10'") + prompt: str = Field(...) + mode: str = Field("pro") + sound: str = Field(..., description="'on' or 'off'") + + +class ImageToVideoWithAudioRequest(BaseModel): + model_name: str = Field(..., description="kling-v2-6") + image: str = Field(...) + duration: str = Field(..., description="'5' or '10'") + prompt: str = Field(...) + mode: str = Field("pro") + sound: str = Field(..., description="'on' or 'off'") diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index a2cc87d84..e545fe490 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -50,6 +50,7 @@ from comfy_api_nodes.apis import ( KlingSingleImageEffectModelName, ) from comfy_api_nodes.apis.kling_api import ( + ImageToVideoWithAudioRequest, OmniImageParamImage, OmniParamImage, OmniParamVideo, @@ -57,7 +58,8 @@ from comfy_api_nodes.apis.kling_api import ( OmniProImageRequest, OmniProReferences2VideoRequest, OmniProText2VideoRequest, - OmniTaskStatusResponse, + TaskStatusResponse, + TextToVideoWithAudioRequest, ) from comfy_api_nodes.util import ( ApiEndpoint, @@ -242,7 +244,7 @@ def normalize_omni_prompt_references(prompt: str) -> str: return re.sub(r"(?\d*)(?!\w)", _video_repl, prompt) -async def finish_omni_video_task(cls: type[IO.ComfyNode], response: OmniTaskStatusResponse) -> IO.NodeOutput: +async def finish_omni_video_task(cls: type[IO.ComfyNode], response: TaskStatusResponse) -> IO.NodeOutput: if response.code: raise RuntimeError( f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}" @@ -250,7 +252,7 @@ async def finish_omni_video_task(cls: type[IO.ComfyNode], response: OmniTaskStat final_response = await poll_op( cls, ApiEndpoint(path=f"/proxy/kling/v1/videos/omni-video/{response.data.task_id}"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, status_extractor=lambda r: (r.data.task_status if r.data else None), max_poll_attempts=160, ) @@ -483,12 +485,12 @@ async def execute_image2video( task_id = task_creation_response.data.task_id final_response = await poll_op( - cls, - ApiEndpoint(path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}"), - response_model=KlingImage2VideoResponse, - estimated_duration=AVERAGE_DURATION_I2V, - status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), - ) + cls, + ApiEndpoint(path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}"), + response_model=KlingImage2VideoResponse, + estimated_duration=AVERAGE_DURATION_I2V, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), + ) validate_video_result_response(final_response) video = get_video_from_response(final_response) @@ -834,7 +836,7 @@ class OmniProTextToVideoNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, data=OmniProText2VideoRequest( model_name=model_name, prompt=prompt, @@ -929,7 +931,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, data=OmniProFirstLastFrameRequest( model_name=model_name, prompt=prompt, @@ -997,7 +999,7 @@ class OmniProImageToVideoNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, data=OmniProReferences2VideoRequest( model_name=model_name, prompt=prompt, @@ -1081,7 +1083,7 @@ class OmniProVideoToVideoNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, data=OmniProReferences2VideoRequest( model_name=model_name, prompt=prompt, @@ -1162,7 +1164,7 @@ class OmniProEditVideoNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, data=OmniProReferences2VideoRequest( model_name=model_name, prompt=prompt, @@ -1237,7 +1239,7 @@ class OmniProImageNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/images/omni-image", method="POST"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, data=OmniProImageRequest( model_name=model_name, prompt=prompt, @@ -1253,7 +1255,7 @@ class OmniProImageNode(IO.ComfyNode): final_response = await poll_op( cls, ApiEndpoint(path=f"/proxy/kling/v1/images/omni-image/{response.data.task_id}"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, status_extractor=lambda r: (r.data.task_status if r.data else None), ) return IO.NodeOutput(await download_url_to_image_tensor(final_response.data.task_result.images[0].url)) @@ -1328,9 +1330,8 @@ class KlingImage2VideoNode(IO.ComfyNode): def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="KlingImage2VideoNode", - display_name="Kling Image to Video", + display_name="Kling Image(First Frame) to Video", category="api node/video/Kling", - description="Kling Image to Video Node", inputs=[ IO.Image.Input("start_frame", tooltip="The reference image used to generate the video."), IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), @@ -2034,6 +2035,136 @@ class KlingImageGenerationNode(IO.ComfyNode): return IO.NodeOutput(await image_result_to_node_output(images)) +class TextToVideoWithAudio(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingTextToVideoWithAudio", + display_name="Kling Text to Video with Audio", + category="api node/video/Kling", + inputs=[ + IO.Combo.Input("model_name", options=["kling-v2-6"]), + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt."), + IO.Combo.Input("mode", options=["pro"]), + IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]), + IO.Combo.Input("duration", options=[5, 10]), + IO.Boolean.Input("generate_audio", default=True), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model_name: str, + prompt: str, + mode: str, + aspect_ratio: str, + duration: int, + generate_audio: bool, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=2500) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/kling/v1/videos/text2video", method="POST"), + response_model=TaskStatusResponse, + data=TextToVideoWithAudioRequest( + model_name=model_name, + prompt=prompt, + mode=mode, + aspect_ratio=aspect_ratio, + duration=str(duration), + sound="on" if generate_audio else "off", + ), + ) + if response.code: + raise RuntimeError( + f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}" + ) + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/kling/v1/videos/text2video/{response.data.task_id}"), + response_model=TaskStatusResponse, + status_extractor=lambda r: (r.data.task_status if r.data else None), + ) + return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url)) + + +class ImageToVideoWithAudio(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingImageToVideoWithAudio", + display_name="Kling Image(First Frame) to Video with Audio", + category="api node/video/Kling", + inputs=[ + IO.Combo.Input("model_name", options=["kling-v2-6"]), + IO.Image.Input("start_frame"), + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt."), + IO.Combo.Input("mode", options=["pro"]), + IO.Combo.Input("duration", options=[5, 10]), + IO.Boolean.Input("generate_audio", default=True), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model_name: str, + start_frame: Input.Image, + prompt: str, + mode: str, + duration: int, + generate_audio: bool, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=2500) + validate_image_dimensions(start_frame, min_width=300, min_height=300) + validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1)) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/kling/v1/videos/image2video", method="POST"), + response_model=TaskStatusResponse, + data=ImageToVideoWithAudioRequest( + model_name=model_name, + image=(await upload_images_to_comfyapi(cls, start_frame))[0], + prompt=prompt, + mode=mode, + duration=str(duration), + sound="on" if generate_audio else "off", + ), + ) + if response.code: + raise RuntimeError( + f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}" + ) + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/kling/v1/videos/image2video/{response.data.task_id}"), + response_model=TaskStatusResponse, + status_extractor=lambda r: (r.data.task_status if r.data else None), + ) + return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url)) + + class KlingExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: @@ -2057,6 +2188,8 @@ class KlingExtension(ComfyExtension): OmniProVideoToVideoNode, OmniProEditVideoNode, OmniProImageNode, + TextToVideoWithAudio, + ImageToVideoWithAudio, ] From c5a47a16924e1be96241553a1448b298e57e50a1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 12 Dec 2025 08:49:35 -0800 Subject: [PATCH 735/759] Fix bias dtype issue in mixed ops. (#11293) --- comfy/ops.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 6ae6e791a..0384c8717 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -504,10 +504,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec self.in_features = in_features self.out_features = out_features - if bias: - self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs)) - else: - self.register_parameter("bias", None) + self._has_bias = bias self.tensor_class = None self._full_precision_mm = MixedPrecisionOps._full_precision_mm @@ -536,6 +533,10 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec self.weight = torch.nn.Parameter(weight.to(device=device, dtype=dtype), requires_grad=False) if dtype != MixedPrecisionOps._compute_dtype: self.comfy_cast_weights = True + if self._has_bias: + self.bias = torch.nn.Parameter(torch.empty(self.out_features, device=device, dtype=dtype)) + else: + self.register_parameter("bias", None) else: self.quant_format = layer_conf.get("format", None) if not self._full_precision_mm: @@ -565,6 +566,11 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec requires_grad=False ) + if self._has_bias: + self.bias = torch.nn.Parameter(torch.empty(self.out_features, device=device, dtype=MixedPrecisionOps._compute_dtype)) + else: + self.register_parameter("bias", None) + for param_name in qconfig["parameters"]: param_key = f"{prefix}{param_name}" _v = state_dict.pop(param_key, None) From da2bfb5b0af26c7a1c44ec951dbd0fffe413c793 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 12 Dec 2025 22:39:11 -0800 Subject: [PATCH 736/759] Basic implementation of z image fun control union 2.0 (#11304) The inpaint part is currently missing and will be implemented later. I think they messed up this model pretty bad. They added some control_noise_refiner blocks but don't actually use them. There is a typo in their code so instead of doing control_noise_refiner -> control_layers it runs the whole control_layers twice. Unfortunately they trained with this typo so the model works but is kind of slow and would probably perform a lot better if they corrected their code and trained it again. --- comfy/ldm/lumina/controlnet.py | 95 +++++++++++++++++++++++-------- comfy/ldm/lumina/model.py | 16 +++++- comfy/model_patcher.py | 3 + comfy_extras/nodes_model_patch.py | 72 +++++++++++++++++------ 4 files changed, 142 insertions(+), 44 deletions(-) diff --git a/comfy/ldm/lumina/controlnet.py b/comfy/ldm/lumina/controlnet.py index fd7ce3b5c..8e2de7977 100644 --- a/comfy/ldm/lumina/controlnet.py +++ b/comfy/ldm/lumina/controlnet.py @@ -41,6 +41,11 @@ class ZImage_Control(torch.nn.Module): ffn_dim_multiplier: float = (8.0 / 3.0), norm_eps: float = 1e-5, qk_norm: bool = True, + n_control_layers=6, + control_in_dim=16, + additional_in_dim=0, + broken=False, + refiner_control=False, dtype=None, device=None, operations=None, @@ -49,10 +54,11 @@ class ZImage_Control(torch.nn.Module): super().__init__() operation_settings = {"operations": operations, "device": device, "dtype": dtype} - self.additional_in_dim = 0 - self.control_in_dim = 16 + self.broken = broken + self.additional_in_dim = additional_in_dim + self.control_in_dim = control_in_dim n_refiner_layers = 2 - self.n_control_layers = 6 + self.n_control_layers = n_control_layers self.control_layers = nn.ModuleList( [ ZImageControlTransformerBlock( @@ -74,28 +80,49 @@ class ZImage_Control(torch.nn.Module): all_x_embedder = {} patch_size = 2 f_patch_size = 1 - x_embedder = operations.Linear(f_patch_size * patch_size * patch_size * self.control_in_dim, dim, bias=True, device=device, dtype=dtype) + x_embedder = operations.Linear(f_patch_size * patch_size * patch_size * (self.control_in_dim + self.additional_in_dim), dim, bias=True, device=device, dtype=dtype) all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder + self.refiner_control = refiner_control + self.control_all_x_embedder = nn.ModuleDict(all_x_embedder) - self.control_noise_refiner = nn.ModuleList( - [ - JointTransformerBlock( - layer_id, - dim, - n_heads, - n_kv_heads, - multiple_of, - ffn_dim_multiplier, - norm_eps, - qk_norm, - modulation=True, - z_image_modulation=True, - operation_settings=operation_settings, - ) - for layer_id in range(n_refiner_layers) - ] - ) + if self.refiner_control: + self.control_noise_refiner = nn.ModuleList( + [ + ZImageControlTransformerBlock( + layer_id, + dim, + n_heads, + n_kv_heads, + multiple_of, + ffn_dim_multiplier, + norm_eps, + qk_norm, + block_id=layer_id, + operation_settings=operation_settings, + ) + for layer_id in range(n_refiner_layers) + ] + ) + else: + self.control_noise_refiner = nn.ModuleList( + [ + JointTransformerBlock( + layer_id, + dim, + n_heads, + n_kv_heads, + multiple_of, + ffn_dim_multiplier, + norm_eps, + qk_norm, + modulation=True, + z_image_modulation=True, + operation_settings=operation_settings, + ) + for layer_id in range(n_refiner_layers) + ] + ) def forward(self, cap_feats, control_context, x_freqs_cis, adaln_input): patch_size = 2 @@ -105,9 +132,29 @@ class ZImage_Control(torch.nn.Module): control_context = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_context.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2)) x_attn_mask = None - for layer in self.control_noise_refiner: - control_context = layer(control_context, x_attn_mask, x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input) + if not self.refiner_control: + for layer in self.control_noise_refiner: + control_context = layer(control_context, x_attn_mask, x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input) + return control_context + def forward_noise_refiner_block(self, layer_id, control_context, x, x_attn_mask, x_freqs_cis, adaln_input): + if self.refiner_control: + if self.broken: + if layer_id == 0: + return self.control_layers[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input) + if layer_id > 0: + out = None + for i in range(1, len(self.control_layers)): + o, control_context = self.control_layers[i](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input) + if out is None: + out = o + + return (out, control_context) + else: + return self.control_noise_refiner[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input) + else: + return (None, control_context) + def forward_control_block(self, layer_id, control_context, x, x_attn_mask, x_freqs_cis, adaln_input): return self.control_layers[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index c47df49ca..96cb37fa6 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -536,6 +536,7 @@ class NextDiT(nn.Module): bsz = len(x) pH = pW = self.patch_size device = x[0].device + orig_x = x if self.pad_tokens_multiple is not None: pad_extra = (-cap_feats.shape[1]) % self.pad_tokens_multiple @@ -572,13 +573,21 @@ class NextDiT(nn.Module): freqs_cis = self.rope_embedder(torch.cat((cap_pos_ids, x_pos_ids), dim=1)).movedim(1, 2) + patches = transformer_options.get("patches", {}) + # refine context for layer in self.context_refiner: cap_feats = layer(cap_feats, cap_mask, freqs_cis[:, :cap_pos_ids.shape[1]], transformer_options=transformer_options) padded_img_mask = None - for layer in self.noise_refiner: + x_input = x + for i, layer in enumerate(self.noise_refiner): x = layer(x, padded_img_mask, freqs_cis[:, cap_pos_ids.shape[1]:], t, transformer_options=transformer_options) + if "noise_refiner" in patches: + for p in patches["noise_refiner"]: + out = p({"img": x, "img_input": x_input, "txt": cap_feats, "pe": freqs_cis[:, cap_pos_ids.shape[1]:], "vec": t, "x": orig_x, "block_index": i, "transformer_options": transformer_options, "block_type": "noise_refiner"}) + if "img" in out: + x = out["img"] padded_full_embed = torch.cat((cap_feats, x), dim=1) mask = None @@ -622,14 +631,15 @@ class NextDiT(nn.Module): patches = transformer_options.get("patches", {}) x_is_tensor = isinstance(x, torch.Tensor) - img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options) + img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, transformer_options=transformer_options) freqs_cis = freqs_cis.to(img.device) + img_input = img for i, layer in enumerate(self.layers): img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options) if "double_block" in patches: for p in patches["double_block"]: - out = p({"img": img[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options}) + out = p({"img": img[:, cap_size[0]:], "img_input": img_input[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options}) if "img" in out: img[:, cap_size[0]:] = out["img"] if "txt" in out: diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index a486c2723..93d26c690 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -454,6 +454,9 @@ class ModelPatcher: def set_model_post_input_patch(self, patch): self.set_model_patch(patch, "post_input") + def set_model_noise_refiner_patch(self, patch): + self.set_model_patch(patch, "noise_refiner") + def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs): rope_options = self.model_options["transformer_options"].get("rope_options", {}) rope_options["scale_x"] = scale_x diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index c61810dbf..ec0e790dc 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -243,7 +243,13 @@ class ModelPatchLoader: model = SigLIPMultiFeatProjModel(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) elif 'control_all_x_embedder.2-1.weight' in sd: # alipai z image fun controlnet sd = z_image_convert(sd) - model = comfy.ldm.lumina.controlnet.ZImage_Control(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + config = {} + if 'control_layers.14.adaLN_modulation.0.weight' in sd: + config['n_control_layers'] = 15 + config['additional_in_dim'] = 17 + config['refiner_control'] = True + config['broken'] = True + model = comfy.ldm.lumina.controlnet.ZImage_Control(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast, **config) model.load_state_dict(sd) model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) @@ -297,56 +303,86 @@ class DiffSynthCnetPatch: return [self.model_patch] class ZImageControlPatch: - def __init__(self, model_patch, vae, image, strength): + def __init__(self, model_patch, vae, image, strength, inpaint_image=None, mask=None): self.model_patch = model_patch self.vae = vae self.image = image + self.inpaint_image = inpaint_image + self.mask = mask self.strength = strength self.encoded_image = self.encode_latent_cond(image) self.encoded_image_size = (image.shape[1], image.shape[2]) self.temp_data = None - def encode_latent_cond(self, image): - latent_image = comfy.latent_formats.Flux().process_in(self.vae.encode(image)) - return latent_image + def encode_latent_cond(self, control_image, inpaint_image=None): + latent_image = comfy.latent_formats.Flux().process_in(self.vae.encode(control_image)) + if self.model_patch.model.additional_in_dim > 0: + if self.mask is None: + mask_ = torch.zeros_like(latent_image)[:, :1] + else: + mask_ = comfy.utils.common_upscale(self.mask.mean(dim=1, keepdim=True), latent_image.shape[-1], latent_image.shape[-2], "bilinear", "none") + if inpaint_image is None: + inpaint_image = torch.ones_like(control_image) * 0.5 + + inpaint_image_latent = comfy.latent_formats.Flux().process_in(self.vae.encode(inpaint_image)) + + return torch.cat([latent_image, mask_, inpaint_image_latent], dim=1) + else: + return latent_image def __call__(self, kwargs): x = kwargs.get("x") img = kwargs.get("img") + img_input = kwargs.get("img_input") txt = kwargs.get("txt") pe = kwargs.get("pe") vec = kwargs.get("vec") block_index = kwargs.get("block_index") + block_type = kwargs.get("block_type", "") spacial_compression = self.vae.spacial_compression_encode() if self.encoded_image is None or self.encoded_image_size != (x.shape[-2] * spacial_compression, x.shape[-1] * spacial_compression): image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") + inpaint_scaled = None + if self.inpaint_image is not None: + inpaint_scaled = comfy.utils.common_upscale(self.inpaint_image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center").movedim(1, -1) loaded_models = comfy.model_management.loaded_models(only_currently_used=True) - self.encoded_image = self.encode_latent_cond(image_scaled.movedim(1, -1)) + self.encoded_image = self.encode_latent_cond(image_scaled.movedim(1, -1), inpaint_scaled) self.encoded_image_size = (image_scaled.shape[-2], image_scaled.shape[-1]) comfy.model_management.load_models_gpu(loaded_models) - cnet_index = (block_index // 5) - cnet_index_float = (block_index / 5) + cnet_blocks = self.model_patch.model.n_control_layers + div = round(30 / cnet_blocks) + + cnet_index = (block_index // div) + cnet_index_float = (block_index / div) kwargs.pop("img") # we do ops in place kwargs.pop("txt") - cnet_blocks = self.model_patch.model.n_control_layers if cnet_index_float > (cnet_blocks - 1): self.temp_data = None return kwargs if self.temp_data is None or self.temp_data[0] > cnet_index: - self.temp_data = (-1, (None, self.model_patch.model(txt, self.encoded_image.to(img.dtype), pe, vec))) + if block_type == "noise_refiner": + self.temp_data = (-3, (None, self.model_patch.model(txt, self.encoded_image.to(img.dtype), pe, vec))) + else: + self.temp_data = (-1, (None, self.model_patch.model(txt, self.encoded_image.to(img.dtype), pe, vec))) - while self.temp_data[0] < cnet_index and (self.temp_data[0] + 1) < cnet_blocks: + if block_type == "noise_refiner": next_layer = self.temp_data[0] + 1 - self.temp_data = (next_layer, self.model_patch.model.forward_control_block(next_layer, self.temp_data[1][1], img[:, :self.temp_data[1][1].shape[1]], None, pe, vec)) + self.temp_data = (next_layer, self.model_patch.model.forward_noise_refiner_block(block_index, self.temp_data[1][1], img_input[:, :self.temp_data[1][1].shape[1]], None, pe, vec)) + if self.temp_data[1][0] is not None: + img[:, :self.temp_data[1][0].shape[1]] += (self.temp_data[1][0] * self.strength) + else: + while self.temp_data[0] < cnet_index and (self.temp_data[0] + 1) < cnet_blocks: + next_layer = self.temp_data[0] + 1 + self.temp_data = (next_layer, self.model_patch.model.forward_control_block(next_layer, self.temp_data[1][1], img_input[:, :self.temp_data[1][1].shape[1]], None, pe, vec)) - if cnet_index_float == self.temp_data[0]: - img[:, :self.temp_data[1][0].shape[1]] += (self.temp_data[1][0] * self.strength) - if cnet_blocks == self.temp_data[0] + 1: - self.temp_data = None + if cnet_index_float == self.temp_data[0]: + img[:, :self.temp_data[1][0].shape[1]] += (self.temp_data[1][0] * self.strength) + if cnet_blocks == self.temp_data[0] + 1: + self.temp_data = None return kwargs @@ -386,7 +422,9 @@ class QwenImageDiffsynthControlnet: mask = 1.0 - mask if isinstance(model_patch.model, comfy.ldm.lumina.controlnet.ZImage_Control): - model_patched.set_model_double_block_patch(ZImageControlPatch(model_patch, vae, image, strength)) + patch = ZImageControlPatch(model_patch, vae, image, strength, mask=mask) + model_patched.set_model_noise_refiner_patch(patch) + model_patched.set_model_double_block_patch(patch) else: model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength, mask)) return (model_patched,) From 971cefe7d4ca15c949d5d901a663cb66562a4f10 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 13 Dec 2025 15:45:23 -0800 Subject: [PATCH 737/759] Fix pytorch warnings. (#11314) --- comfy/ops.py | 2 +- comfy/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 0384c8717..16889bb82 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -592,7 +592,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec quant_conf = {"format": self.quant_format} if self._full_precision_mm: quant_conf["full_precision_matrix_mult"] = True - sd["{}comfy_quant".format(prefix)] = torch.frombuffer(json.dumps(quant_conf).encode('utf-8'), dtype=torch.uint8) + sd["{}comfy_quant".format(prefix)] = torch.tensor(list(json.dumps(quant_conf).encode('utf-8')), dtype=torch.uint8) return sd def _forward(self, input, weight, bias): diff --git a/comfy/utils.py b/comfy/utils.py index 9dc0d76ac..3866cda2e 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -1262,6 +1262,6 @@ def convert_old_quants(state_dict, model_prefix="", metadata={}): if quant_metadata is not None: layers = quant_metadata["layers"] for k, v in layers.items(): - state_dict["{}.comfy_quant".format(k)] = torch.frombuffer(json.dumps(v).encode('utf-8'), dtype=torch.uint8) + state_dict["{}.comfy_quant".format(k)] = torch.tensor(list(json.dumps(v).encode('utf-8')), dtype=torch.uint8) return state_dict, metadata From 6592bffc609da4738b111dbffca1f473972f3574 Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Sun, 14 Dec 2025 13:03:29 +0800 Subject: [PATCH 738/759] seeds_2: add phi_2 variant and sampler node (#11309) * Add phi_2 solver type to seeds_2 * Add sampler node of seeds_2 --- comfy/k_diffusion/sampling.py | 15 ++++++++++++--- comfy_extras/nodes_custom_sampler.py | 26 ++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index 0e2cda291..753c66afa 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -1557,10 +1557,13 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None @torch.no_grad() -def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5): +def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5, solver_type="phi_1"): """SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 2. arXiv: https://arxiv.org/abs/2305.14267 (NeurIPS 2023) """ + if solver_type not in {"phi_1", "phi_2"}: + raise ValueError("solver_type must be 'phi_1' or 'phi_2'") + extra_args = {} if extra_args is None else extra_args seed = extra_args.get("seed", None) noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler @@ -1600,8 +1603,14 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args) # Step 2 - denoised_d = torch.lerp(denoised, denoised_2, fac) - x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d + if solver_type == "phi_1": + denoised_d = torch.lerp(denoised, denoised_2, fac) + x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d + elif solver_type == "phi_2": + b2 = ei_h_phi_2(-h_eta) / r + b1 = ei_h_phi_1(-h_eta) - b2 + x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * (b1 * denoised + b2 * denoised_2) + if inject_noise: segment_factor = (r - 1) * h * eta sde_noise = sde_noise * segment_factor.exp() diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index fbb080886..71ea4e9ec 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -659,6 +659,31 @@ class SamplerSASolver(io.ComfyNode): get_sampler = execute +class SamplerSEEDS2(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SamplerSEEDS2", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Combo.Input("solver_type", options=["phi_1", "phi_2"]), + io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, tooltip="Stochastic strength"), + io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False, tooltip="SDE noise multiplier"), + io.Float.Input("r", default=0.5, min=0.01, max=1.0, step=0.01, round=False, tooltip="Relative step size for the intermediate stage (c2 node)"), + ], + outputs=[io.Sampler.Output()] + ) + + @classmethod + def execute(cls, solver_type, eta, s_noise, r) -> io.NodeOutput: + sampler_name = "seeds_2" + sampler = comfy.samplers.ksampler( + sampler_name, + {"eta": eta, "s_noise": s_noise, "r": r, "solver_type": solver_type}, + ) + return io.NodeOutput(sampler) + + class Noise_EmptyNoise: def __init__(self): self.seed = 0 @@ -996,6 +1021,7 @@ class CustomSamplersExtension(ComfyExtension): SamplerDPMAdaptative, SamplerER_SDE, SamplerSASolver, + SamplerSEEDS2, SplitSigmas, SplitSigmasDenoise, FlipSigmas, From 5ac3b26a7dedb9b13c681abe8733c54f13353273 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 14 Dec 2025 01:02:50 -0800 Subject: [PATCH 739/759] Update warning for old pytorch version. (#11319) Versions below 2.4 are no longer supported. We will not break support on purpose but will not fix it if we do. --- comfy/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/utils.py b/comfy/utils.py index 3866cda2e..8d4e2b445 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -53,7 +53,7 @@ if hasattr(torch.serialization, "add_safe_globals"): # TODO: this was added in ALWAYS_SAFE_LOAD = True logging.info("Checkpoint files will always be loaded safely.") else: - logging.info("Warning, you are using an old pytorch version and some ckpt/pt files might be loaded unsafely. Upgrading to 2.4 or above is recommended.") + logging.warning("Warning, you are using an old pytorch version and some ckpt/pt files might be loaded unsafely. Upgrading to 2.4 or above is recommended as older versions of pytorch are no longer supported.") def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False): if device is None: From a5e85017d8574cb99024d320f7a53a77a9e6aa5a Mon Sep 17 00:00:00 2001 From: "Dr.Lt.Data" <128333288+ltdrdata@users.noreply.github.com> Date: Tue, 16 Dec 2025 04:24:01 +0900 Subject: [PATCH 740/759] bump manager requirments to the 4.0.3b5 (#11324) --- manager_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manager_requirements.txt b/manager_requirements.txt index b95cefb74..5ef0d3a1d 100644 --- a/manager_requirements.txt +++ b/manager_requirements.txt @@ -1 +1 @@ -comfyui_manager==4.0.3b4 +comfyui_manager==4.0.3b5 From 51347f9fb8a8e60d3add049c6f241822c84c8a87 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 16 Dec 2025 05:28:55 +0800 Subject: [PATCH 741/759] chore: update workflow templates to v0.7.59 (#11337) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9e9b25328..117260515 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.34.8 -comfyui-workflow-templates==0.7.54 +comfyui-workflow-templates==0.7.59 comfyui-embedded-docs==0.3.1 torch torchsde From 5cb1e0c9a0439f1f95a0b372474bd4845e38009c Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 15 Dec 2025 13:49:29 -0800 Subject: [PATCH 742/759] Disable guards on transformer_options when torch.compile (#11317) --- comfy_extras/nodes_torch_compile.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_torch_compile.py b/comfy_extras/nodes_torch_compile.py index adbeece2f..c43e8ad63 100644 --- a/comfy_extras/nodes_torch_compile.py +++ b/comfy_extras/nodes_torch_compile.py @@ -2,6 +2,8 @@ from typing_extensions import override from comfy_api.latest import ComfyExtension, io from comfy_api.torch_helpers import set_torch_compile_wrapper +def skip_torch_compile_dict(guard_entries): + return [("transformer_options" not in entry.name) for entry in guard_entries] class TorchCompileModel(io.ComfyNode): @classmethod @@ -23,7 +25,7 @@ class TorchCompileModel(io.ComfyNode): @classmethod def execute(cls, model, backend) -> io.NodeOutput: m = model.clone() - set_torch_compile_wrapper(model=m, backend=backend) + set_torch_compile_wrapper(model=m, backend=backend, options={"guard_filter_fn": skip_torch_compile_dict}) return io.NodeOutput(m) From af91eb6c9931d0a2c99cf8a6d4974a6abf9a09fa Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 16 Dec 2025 01:30:24 +0200 Subject: [PATCH 743/759] api-nodes: drop Kling v1 model (#11307) --- comfy_api_nodes/nodes_kling.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index e545fe490..1a6364fa0 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -105,10 +105,6 @@ AVERAGE_DURATION_VIDEO_EXTEND = 320 MODE_TEXT2VIDEO = { - "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"), - "standard mode / 10s duration / kling-v1": ("std", "10", "kling-v1"), - "pro mode / 5s duration / kling-v1": ("pro", "5", "kling-v1"), - "pro mode / 10s duration / kling-v1": ("pro", "10", "kling-v1"), "standard mode / 5s duration / kling-v1-6": ("std", "5", "kling-v1-6"), "standard mode / 10s duration / kling-v1-6": ("std", "10", "kling-v1-6"), "pro mode / 5s duration / kling-v2-master": ("pro", "5", "kling-v2-master"), @@ -129,8 +125,6 @@ See: [Kling API Docs Capability Map](https://app.klingai.com/global/dev/document MODE_START_END_FRAME = { - "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"), - "pro mode / 5s duration / kling-v1": ("pro", "5", "kling-v1"), "pro mode / 5s duration / kling-v1-5": ("pro", "5", "kling-v1-5"), "pro mode / 10s duration / kling-v1-5": ("pro", "10", "kling-v1-5"), "pro mode / 5s duration / kling-v1-6": ("pro", "5", "kling-v1-6"), @@ -754,7 +748,7 @@ class KlingTextToVideoNode(IO.ComfyNode): IO.Combo.Input( "mode", options=modes, - default=modes[4], + default=modes[8], tooltip="The configuration to use for the video generation following the format: mode / duration / model_name.", ), ], @@ -1489,7 +1483,7 @@ class KlingStartEndFrameNode(IO.ComfyNode): IO.Combo.Input( "mode", options=modes, - default=modes[8], + default=modes[6], tooltip="The configuration to use for the video generation following the format: mode / duration / model_name.", ), ], @@ -1952,7 +1946,7 @@ class KlingImageGenerationNode(IO.ComfyNode): IO.Combo.Input( "model_name", options=[i.value for i in KlingImageGenModelName], - default="kling-v1", + default="kling-v2", ), IO.Combo.Input( "aspect_ratio", From 33c7f1179d4a961e4ca1dd78188c5134e0ee8e8c Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 16 Dec 2025 01:32:29 +0200 Subject: [PATCH 744/759] drop Pika API nodes (#11306) --- comfy_api_nodes/apis/pika_api.py | 100 ------ comfy_api_nodes/nodes_pika.py | 575 ------------------------------- nodes.py | 1 - 3 files changed, 676 deletions(-) delete mode 100644 comfy_api_nodes/apis/pika_api.py delete mode 100644 comfy_api_nodes/nodes_pika.py diff --git a/comfy_api_nodes/apis/pika_api.py b/comfy_api_nodes/apis/pika_api.py deleted file mode 100644 index 232558cd7..000000000 --- a/comfy_api_nodes/apis/pika_api.py +++ /dev/null @@ -1,100 +0,0 @@ -from typing import Optional -from enum import Enum -from pydantic import BaseModel, Field - - -class Pikaffect(str, Enum): - Cake_ify = "Cake-ify" - Crumble = "Crumble" - Crush = "Crush" - Decapitate = "Decapitate" - Deflate = "Deflate" - Dissolve = "Dissolve" - Explode = "Explode" - Eye_pop = "Eye-pop" - Inflate = "Inflate" - Levitate = "Levitate" - Melt = "Melt" - Peel = "Peel" - Poke = "Poke" - Squish = "Squish" - Ta_da = "Ta-da" - Tear = "Tear" - - -class PikaBodyGenerate22C2vGenerate22PikascenesPost(BaseModel): - aspectRatio: Optional[float] = Field(None, description='Aspect ratio (width / height)') - duration: Optional[int] = Field(5) - ingredientsMode: str = Field(...) - negativePrompt: Optional[str] = Field(None) - promptText: Optional[str] = Field(None) - resolution: Optional[str] = Field('1080p') - seed: Optional[int] = Field(None) - - -class PikaGenerateResponse(BaseModel): - video_id: str = Field(...) - - -class PikaBodyGenerate22I2vGenerate22I2vPost(BaseModel): - duration: Optional[int] = 5 - negativePrompt: Optional[str] = Field(None) - promptText: Optional[str] = Field(None) - resolution: Optional[str] = '1080p' - seed: Optional[int] = Field(None) - - -class PikaBodyGenerate22KeyframeGenerate22PikaframesPost(BaseModel): - duration: Optional[int] = Field(None, ge=5, le=10) - negativePrompt: Optional[str] = Field(None) - promptText: str = Field(...) - resolution: Optional[str] = '1080p' - seed: Optional[int] = Field(None) - - -class PikaBodyGenerate22T2vGenerate22T2vPost(BaseModel): - aspectRatio: Optional[float] = Field( - 1.7777777777777777, - description='Aspect ratio (width / height)', - ge=0.4, - le=2.5, - ) - duration: Optional[int] = 5 - negativePrompt: Optional[str] = Field(None) - promptText: str = Field(...) - resolution: Optional[str] = '1080p' - seed: Optional[int] = Field(None) - - -class PikaBodyGeneratePikadditionsGeneratePikadditionsPost(BaseModel): - negativePrompt: Optional[str] = Field(None) - promptText: Optional[str] = Field(None) - seed: Optional[int] = Field(None) - - -class PikaBodyGeneratePikaffectsGeneratePikaffectsPost(BaseModel): - negativePrompt: Optional[str] = Field(None) - pikaffect: Optional[str] = None - promptText: Optional[str] = Field(None) - seed: Optional[int] = Field(None) - - -class PikaBodyGeneratePikaswapsGeneratePikaswapsPost(BaseModel): - negativePrompt: Optional[str] = Field(None) - promptText: Optional[str] = Field(None) - seed: Optional[int] = Field(None) - modifyRegionRoi: Optional[str] = Field(None) - - -class PikaStatusEnum(str, Enum): - queued = "queued" - started = "started" - finished = "finished" - failed = "failed" - - -class PikaVideoResponse(BaseModel): - id: str = Field(...) - progress: Optional[int] = Field(None) - status: PikaStatusEnum - url: Optional[str] = Field(None) diff --git a/comfy_api_nodes/nodes_pika.py b/comfy_api_nodes/nodes_pika.py deleted file mode 100644 index acd88c391..000000000 --- a/comfy_api_nodes/nodes_pika.py +++ /dev/null @@ -1,575 +0,0 @@ -""" -Pika x ComfyUI API Nodes - -Pika API docs: https://pika-827374fb.mintlify.app/api-reference -""" -from __future__ import annotations - -from io import BytesIO -import logging -from typing import Optional - -import torch - -from typing_extensions import override -from comfy_api.latest import ComfyExtension, IO -from comfy_api.input_impl.video_types import VideoCodec, VideoContainer, VideoInput -from comfy_api_nodes.apis import pika_api as pika_defs -from comfy_api_nodes.util import ( - validate_string, - download_url_to_video_output, - tensor_to_bytesio, - ApiEndpoint, - sync_op, - poll_op, -) - - -PATH_PIKADDITIONS = "/proxy/pika/generate/pikadditions" -PATH_PIKASWAPS = "/proxy/pika/generate/pikaswaps" -PATH_PIKAFFECTS = "/proxy/pika/generate/pikaffects" - -PIKA_API_VERSION = "2.2" -PATH_TEXT_TO_VIDEO = f"/proxy/pika/generate/{PIKA_API_VERSION}/t2v" -PATH_IMAGE_TO_VIDEO = f"/proxy/pika/generate/{PIKA_API_VERSION}/i2v" -PATH_PIKAFRAMES = f"/proxy/pika/generate/{PIKA_API_VERSION}/pikaframes" -PATH_PIKASCENES = f"/proxy/pika/generate/{PIKA_API_VERSION}/pikascenes" - -PATH_VIDEO_GET = "/proxy/pika/videos" - - -async def execute_task( - task_id: str, - cls: type[IO.ComfyNode], -) -> IO.NodeOutput: - final_response: pika_defs.PikaVideoResponse = await poll_op( - cls, - ApiEndpoint(path=f"{PATH_VIDEO_GET}/{task_id}"), - response_model=pika_defs.PikaVideoResponse, - status_extractor=lambda response: (response.status.value if response.status else None), - progress_extractor=lambda response: (response.progress if hasattr(response, "progress") else None), - estimated_duration=60, - max_poll_attempts=240, - ) - if not final_response.url: - error_msg = f"Pika task {task_id} succeeded but no video data found in response:\n{final_response}" - logging.error(error_msg) - raise Exception(error_msg) - video_url = final_response.url - logging.info("Pika task %s succeeded. Video URL: %s", task_id, video_url) - return IO.NodeOutput(await download_url_to_video_output(video_url)) - - -def get_base_inputs_types() -> list[IO.Input]: - """Get the base required inputs types common to all Pika nodes.""" - return [ - IO.String.Input("prompt_text", multiline=True), - IO.String.Input("negative_prompt", multiline=True), - IO.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), - IO.Combo.Input("resolution", options=["1080p", "720p"], default="1080p"), - IO.Combo.Input("duration", options=[5, 10], default=5), - ] - - -class PikaImageToVideo(IO.ComfyNode): - """Pika 2.2 Image to Video Node.""" - - @classmethod - def define_schema(cls) -> IO.Schema: - return IO.Schema( - node_id="PikaImageToVideoNode2_2", - display_name="Pika Image to Video", - description="Sends an image and prompt to the Pika API v2.2 to generate a video.", - category="api node/video/Pika", - inputs=[ - IO.Image.Input("image", tooltip="The image to convert to video"), - *get_base_inputs_types(), - ], - outputs=[IO.Video.Output()], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], - is_api_node=True, - is_deprecated=True, - ) - - @classmethod - async def execute( - cls, - image: torch.Tensor, - prompt_text: str, - negative_prompt: str, - seed: int, - resolution: str, - duration: int, - ) -> IO.NodeOutput: - image_bytes_io = tensor_to_bytesio(image) - pika_files = {"image": ("image.png", image_bytes_io, "image/png")} - pika_request_data = pika_defs.PikaBodyGenerate22I2vGenerate22I2vPost( - promptText=prompt_text, - negativePrompt=negative_prompt, - seed=seed, - resolution=resolution, - duration=duration, - ) - initial_operation = await sync_op( - cls, - ApiEndpoint(path=PATH_IMAGE_TO_VIDEO, method="POST"), - response_model=pika_defs.PikaGenerateResponse, - data=pika_request_data, - files=pika_files, - content_type="multipart/form-data", - ) - return await execute_task(initial_operation.video_id, cls) - - -class PikaTextToVideoNode(IO.ComfyNode): - """Pika Text2Video v2.2 Node.""" - - @classmethod - def define_schema(cls) -> IO.Schema: - return IO.Schema( - node_id="PikaTextToVideoNode2_2", - display_name="Pika Text to Video", - description="Sends a text prompt to the Pika API v2.2 to generate a video.", - category="api node/video/Pika", - inputs=[ - *get_base_inputs_types(), - IO.Float.Input( - "aspect_ratio", - step=0.001, - min=0.4, - max=2.5, - default=1.7777777777777777, - tooltip="Aspect ratio (width / height)", - ) - ], - outputs=[IO.Video.Output()], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], - is_api_node=True, - is_deprecated=True, - ) - - @classmethod - async def execute( - cls, - prompt_text: str, - negative_prompt: str, - seed: int, - resolution: str, - duration: int, - aspect_ratio: float, - ) -> IO.NodeOutput: - initial_operation = await sync_op( - cls, - ApiEndpoint(path=PATH_TEXT_TO_VIDEO, method="POST"), - response_model=pika_defs.PikaGenerateResponse, - data=pika_defs.PikaBodyGenerate22T2vGenerate22T2vPost( - promptText=prompt_text, - negativePrompt=negative_prompt, - seed=seed, - resolution=resolution, - duration=duration, - aspectRatio=aspect_ratio, - ), - content_type="application/x-www-form-urlencoded", - ) - return await execute_task(initial_operation.video_id, cls) - - -class PikaScenes(IO.ComfyNode): - """PikaScenes v2.2 Node.""" - - @classmethod - def define_schema(cls) -> IO.Schema: - return IO.Schema( - node_id="PikaScenesV2_2", - display_name="Pika Scenes (Video Image Composition)", - description="Combine your images to create a video with the objects in them. Upload multiple images as ingredients and generate a high-quality video that incorporates all of them.", - category="api node/video/Pika", - inputs=[ - *get_base_inputs_types(), - IO.Combo.Input( - "ingredients_mode", - options=["creative", "precise"], - default="creative", - ), - IO.Float.Input( - "aspect_ratio", - step=0.001, - min=0.4, - max=2.5, - default=1.7777777777777777, - tooltip="Aspect ratio (width / height)", - ), - IO.Image.Input( - "image_ingredient_1", - optional=True, - tooltip="Image that will be used as ingredient to create a video.", - ), - IO.Image.Input( - "image_ingredient_2", - optional=True, - tooltip="Image that will be used as ingredient to create a video.", - ), - IO.Image.Input( - "image_ingredient_3", - optional=True, - tooltip="Image that will be used as ingredient to create a video.", - ), - IO.Image.Input( - "image_ingredient_4", - optional=True, - tooltip="Image that will be used as ingredient to create a video.", - ), - IO.Image.Input( - "image_ingredient_5", - optional=True, - tooltip="Image that will be used as ingredient to create a video.", - ), - ], - outputs=[IO.Video.Output()], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], - is_api_node=True, - is_deprecated=True, - ) - - @classmethod - async def execute( - cls, - prompt_text: str, - negative_prompt: str, - seed: int, - resolution: str, - duration: int, - ingredients_mode: str, - aspect_ratio: float, - image_ingredient_1: Optional[torch.Tensor] = None, - image_ingredient_2: Optional[torch.Tensor] = None, - image_ingredient_3: Optional[torch.Tensor] = None, - image_ingredient_4: Optional[torch.Tensor] = None, - image_ingredient_5: Optional[torch.Tensor] = None, - ) -> IO.NodeOutput: - all_image_bytes_io = [] - for image in [ - image_ingredient_1, - image_ingredient_2, - image_ingredient_3, - image_ingredient_4, - image_ingredient_5, - ]: - if image is not None: - all_image_bytes_io.append(tensor_to_bytesio(image)) - - pika_files = [ - ("images", (f"image_{i}.png", image_bytes_io, "image/png")) - for i, image_bytes_io in enumerate(all_image_bytes_io) - ] - - pika_request_data = pika_defs.PikaBodyGenerate22C2vGenerate22PikascenesPost( - ingredientsMode=ingredients_mode, - promptText=prompt_text, - negativePrompt=negative_prompt, - seed=seed, - resolution=resolution, - duration=duration, - aspectRatio=aspect_ratio, - ) - initial_operation = await sync_op( - cls, - ApiEndpoint(path=PATH_PIKASCENES, method="POST"), - response_model=pika_defs.PikaGenerateResponse, - data=pika_request_data, - files=pika_files, - content_type="multipart/form-data", - ) - - return await execute_task(initial_operation.video_id, cls) - - -class PikAdditionsNode(IO.ComfyNode): - """Pika Pikadditions Node. Add an image into a video.""" - - @classmethod - def define_schema(cls) -> IO.Schema: - return IO.Schema( - node_id="Pikadditions", - display_name="Pikadditions (Video Object Insertion)", - description="Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result.", - category="api node/video/Pika", - inputs=[ - IO.Video.Input("video", tooltip="The video to add an image to."), - IO.Image.Input("image", tooltip="The image to add to the video."), - IO.String.Input("prompt_text", multiline=True), - IO.String.Input("negative_prompt", multiline=True), - IO.Int.Input( - "seed", - min=0, - max=0xFFFFFFFF, - control_after_generate=True, - ), - ], - outputs=[IO.Video.Output()], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], - is_api_node=True, - is_deprecated=True, - ) - - @classmethod - async def execute( - cls, - video: VideoInput, - image: torch.Tensor, - prompt_text: str, - negative_prompt: str, - seed: int, - ) -> IO.NodeOutput: - video_bytes_io = BytesIO() - video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264) - video_bytes_io.seek(0) - - image_bytes_io = tensor_to_bytesio(image) - pika_files = { - "video": ("video.mp4", video_bytes_io, "video/mp4"), - "image": ("image.png", image_bytes_io, "image/png"), - } - pika_request_data = pika_defs.PikaBodyGeneratePikadditionsGeneratePikadditionsPost( - promptText=prompt_text, - negativePrompt=negative_prompt, - seed=seed, - ) - initial_operation = await sync_op( - cls, - ApiEndpoint(path=PATH_PIKADDITIONS, method="POST"), - response_model=pika_defs.PikaGenerateResponse, - data=pika_request_data, - files=pika_files, - content_type="multipart/form-data", - ) - - return await execute_task(initial_operation.video_id, cls) - - -class PikaSwapsNode(IO.ComfyNode): - """Pika Pikaswaps Node.""" - - @classmethod - def define_schema(cls) -> IO.Schema: - return IO.Schema( - node_id="Pikaswaps", - display_name="Pika Swaps (Video Object Replacement)", - description="Swap out any object or region of your video with a new image or object. Define areas to replace either with a mask or coordinates.", - category="api node/video/Pika", - inputs=[ - IO.Video.Input("video", tooltip="The video to swap an object in."), - IO.Image.Input( - "image", - tooltip="The image used to replace the masked object in the video.", - optional=True, - ), - IO.Mask.Input( - "mask", - tooltip="Use the mask to define areas in the video to replace.", - optional=True, - ), - IO.String.Input("prompt_text", multiline=True, optional=True), - IO.String.Input("negative_prompt", multiline=True, optional=True), - IO.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True, optional=True), - IO.String.Input( - "region_to_modify", - multiline=True, - optional=True, - tooltip="Plaintext description of the object / region to modify.", - ), - ], - outputs=[IO.Video.Output()], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], - is_api_node=True, - is_deprecated=True, - ) - - @classmethod - async def execute( - cls, - video: VideoInput, - image: Optional[torch.Tensor] = None, - mask: Optional[torch.Tensor] = None, - prompt_text: str = "", - negative_prompt: str = "", - seed: int = 0, - region_to_modify: str = "", - ) -> IO.NodeOutput: - video_bytes_io = BytesIO() - video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264) - video_bytes_io.seek(0) - pika_files = { - "video": ("video.mp4", video_bytes_io, "video/mp4"), - } - if mask is not None: - pika_files["modifyRegionMask"] = ("mask.png", tensor_to_bytesio(mask), "image/png") - if image is not None: - pika_files["image"] = ("image.png", tensor_to_bytesio(image), "image/png") - - pika_request_data = pika_defs.PikaBodyGeneratePikaswapsGeneratePikaswapsPost( - promptText=prompt_text, - negativePrompt=negative_prompt, - seed=seed, - modifyRegionRoi=region_to_modify if region_to_modify else None, - ) - initial_operation = await sync_op( - cls, - ApiEndpoint(path=PATH_PIKASWAPS, method="POST"), - response_model=pika_defs.PikaGenerateResponse, - data=pika_request_data, - files=pika_files, - content_type="multipart/form-data", - ) - return await execute_task(initial_operation.video_id, cls) - - -class PikaffectsNode(IO.ComfyNode): - """Pika Pikaffects Node.""" - - @classmethod - def define_schema(cls) -> IO.Schema: - return IO.Schema( - node_id="Pikaffects", - display_name="Pikaffects (Video Effects)", - description="Generate a video with a specific Pikaffect. Supported Pikaffects: Cake-ify, Crumble, Crush, Decapitate, Deflate, Dissolve, Explode, Eye-pop, Inflate, Levitate, Melt, Peel, Poke, Squish, Ta-da, Tear", - category="api node/video/Pika", - inputs=[ - IO.Image.Input("image", tooltip="The reference image to apply the Pikaffect to."), - IO.Combo.Input( - "pikaffect", options=pika_defs.Pikaffect, default="Cake-ify" - ), - IO.String.Input("prompt_text", multiline=True), - IO.String.Input("negative_prompt", multiline=True), - IO.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), - ], - outputs=[IO.Video.Output()], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], - is_api_node=True, - is_deprecated=True, - ) - - @classmethod - async def execute( - cls, - image: torch.Tensor, - pikaffect: str, - prompt_text: str, - negative_prompt: str, - seed: int, - ) -> IO.NodeOutput: - initial_operation = await sync_op( - cls, - ApiEndpoint(path=PATH_PIKAFFECTS, method="POST"), - response_model=pika_defs.PikaGenerateResponse, - data=pika_defs.PikaBodyGeneratePikaffectsGeneratePikaffectsPost( - pikaffect=pikaffect, - promptText=prompt_text, - negativePrompt=negative_prompt, - seed=seed, - ), - files={"image": ("image.png", tensor_to_bytesio(image), "image/png")}, - content_type="multipart/form-data", - ) - return await execute_task(initial_operation.video_id, cls) - - -class PikaStartEndFrameNode(IO.ComfyNode): - """PikaFrames v2.2 Node.""" - - @classmethod - def define_schema(cls) -> IO.Schema: - return IO.Schema( - node_id="PikaStartEndFrameNode2_2", - display_name="Pika Start and End Frame to Video", - description="Generate a video by combining your first and last frame. Upload two images to define the start and end points, and let the AI create a smooth transition between them.", - category="api node/video/Pika", - inputs=[ - IO.Image.Input("image_start", tooltip="The first image to combine."), - IO.Image.Input("image_end", tooltip="The last image to combine."), - *get_base_inputs_types(), - ], - outputs=[IO.Video.Output()], - hidden=[ - IO.Hidden.auth_token_comfy_org, - IO.Hidden.api_key_comfy_org, - IO.Hidden.unique_id, - ], - is_api_node=True, - is_deprecated=True, - ) - - @classmethod - async def execute( - cls, - image_start: torch.Tensor, - image_end: torch.Tensor, - prompt_text: str, - negative_prompt: str, - seed: int, - resolution: str, - duration: int, - ) -> IO.NodeOutput: - validate_string(prompt_text, field_name="prompt_text", min_length=1) - pika_files = [ - ("keyFrames", ("image_start.png", tensor_to_bytesio(image_start), "image/png")), - ("keyFrames", ("image_end.png", tensor_to_bytesio(image_end), "image/png")), - ] - initial_operation = await sync_op( - cls, - ApiEndpoint(path=PATH_PIKAFRAMES, method="POST"), - response_model=pika_defs.PikaGenerateResponse, - data=pika_defs.PikaBodyGenerate22KeyframeGenerate22PikaframesPost( - promptText=prompt_text, - negativePrompt=negative_prompt, - seed=seed, - resolution=resolution, - duration=duration, - ), - files=pika_files, - content_type="multipart/form-data", - ) - return await execute_task(initial_operation.video_id, cls) - - -class PikaApiNodesExtension(ComfyExtension): - @override - async def get_node_list(self) -> list[type[IO.ComfyNode]]: - return [ - PikaImageToVideo, - PikaTextToVideoNode, - PikaScenes, - PikAdditionsNode, - PikaSwapsNode, - PikaffectsNode, - PikaStartEndFrameNode, - ] - - -async def comfy_entrypoint() -> PikaApiNodesExtension: - return PikaApiNodesExtension() diff --git a/nodes.py b/nodes.py index 8678f510a..3fa543294 100644 --- a/nodes.py +++ b/nodes.py @@ -2384,7 +2384,6 @@ async def init_builtin_api_nodes(): "nodes_recraft.py", "nodes_pixverse.py", "nodes_stability.py", - "nodes_pika.py", "nodes_runway.py", "nodes_sora.py", "nodes_topaz.py", From dbd330454ada04609c69fda2ae7c002d7ea05f67 Mon Sep 17 00:00:00 2001 From: "Dr.Lt.Data" <128333288+ltdrdata@users.noreply.github.com> Date: Tue, 16 Dec 2025 08:57:39 +0900 Subject: [PATCH 745/759] feat(preview): add per-queue live preview method override (#11261) - Add set_preview_method() to override live preview method per queue item - Read extra_data.preview_method from /prompt request - Support values: taesd, latent2rgb, none, auto, default - "default" or unset uses server's CLI --preview-method setting - Add 44 tests (37 unit + 7 E2E) --- comfy/cli_args.py | 7 + execution.py | 3 + latent_preview.py | 10 + .../preview_method_override_test.py | 352 +++++++++++++++++ tests/execution/test_preview_method.py | 358 ++++++++++++++++++ 5 files changed, 730 insertions(+) create mode 100644 tests-unit/execution_test/preview_method_override_test.py create mode 100644 tests/execution/test_preview_method.py diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 209fc185b..dae9a895d 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -97,6 +97,13 @@ class LatentPreviewMethod(enum.Enum): Latent2RGB = "latent2rgb" TAESD = "taesd" + @classmethod + def from_string(cls, value: str): + for member in cls: + if member.value == value: + return member + return None + parser.add_argument("--preview-method", type=LatentPreviewMethod, default=LatentPreviewMethod.NoPreviews, help="Default preview method for sampler nodes.", action=EnumAction) parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.") diff --git a/execution.py b/execution.py index c2186ac98..0c239efd7 100644 --- a/execution.py +++ b/execution.py @@ -13,6 +13,7 @@ import asyncio import torch import comfy.model_management +from latent_preview import set_preview_method import nodes from comfy_execution.caching import ( BasicCache, @@ -669,6 +670,8 @@ class PromptExecutor: asyncio.run(self.execute_async(prompt, prompt_id, extra_data, execute_outputs)) async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs=[]): + set_preview_method(extra_data.get("preview_method")) + nodes.interrupt_processing(False) if "client_id" in extra_data: diff --git a/latent_preview.py b/latent_preview.py index 66bded4b9..d52e3f7a1 100644 --- a/latent_preview.py +++ b/latent_preview.py @@ -8,6 +8,8 @@ import folder_paths import comfy.utils import logging +default_preview_method = args.preview_method + MAX_PREVIEW_RESOLUTION = args.preview_size VIDEO_TAES = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5"] @@ -125,3 +127,11 @@ def prepare_callback(model, steps, x0_output_dict=None): pbar.update_absolute(step + 1, total_steps, preview_bytes) return callback +def set_preview_method(override: str = None): + if override and override != "default": + method = LatentPreviewMethod.from_string(override) + if method is not None: + args.preview_method = method + return + args.preview_method = default_preview_method + diff --git a/tests-unit/execution_test/preview_method_override_test.py b/tests-unit/execution_test/preview_method_override_test.py new file mode 100644 index 000000000..79432d610 --- /dev/null +++ b/tests-unit/execution_test/preview_method_override_test.py @@ -0,0 +1,352 @@ +""" +Unit tests for Queue-specific Preview Method Override feature. + +Tests the preview method override functionality: +- LatentPreviewMethod.from_string() method +- set_preview_method() function in latent_preview.py +- default_preview_method variable +- Integration with args.preview_method +""" +import pytest +from comfy.cli_args import args, LatentPreviewMethod +from latent_preview import set_preview_method, default_preview_method + + +class TestLatentPreviewMethodFromString: + """Test LatentPreviewMethod.from_string() classmethod.""" + + @pytest.mark.parametrize("value,expected", [ + ("auto", LatentPreviewMethod.Auto), + ("latent2rgb", LatentPreviewMethod.Latent2RGB), + ("taesd", LatentPreviewMethod.TAESD), + ("none", LatentPreviewMethod.NoPreviews), + ]) + def test_valid_values_return_enum(self, value, expected): + """Valid string values should return corresponding enum.""" + assert LatentPreviewMethod.from_string(value) == expected + + @pytest.mark.parametrize("invalid", [ + "invalid", + "TAESD", # Case sensitive + "AUTO", # Case sensitive + "Latent2RGB", # Case sensitive + "latent", + "", + "default", # default is special, not a method + ]) + def test_invalid_values_return_none(self, invalid): + """Invalid string values should return None.""" + assert LatentPreviewMethod.from_string(invalid) is None + + +class TestLatentPreviewMethodEnumValues: + """Test LatentPreviewMethod enum has expected values.""" + + def test_enum_values(self): + """Verify enum values match expected strings.""" + assert LatentPreviewMethod.NoPreviews.value == "none" + assert LatentPreviewMethod.Auto.value == "auto" + assert LatentPreviewMethod.Latent2RGB.value == "latent2rgb" + assert LatentPreviewMethod.TAESD.value == "taesd" + + def test_enum_count(self): + """Verify exactly 4 preview methods exist.""" + assert len(LatentPreviewMethod) == 4 + + +class TestSetPreviewMethod: + """Test set_preview_method() function from latent_preview.py.""" + + def setup_method(self): + """Store original value before each test.""" + self.original = args.preview_method + + def teardown_method(self): + """Restore original value after each test.""" + args.preview_method = self.original + + def test_override_with_taesd(self): + """'taesd' should set args.preview_method to TAESD.""" + set_preview_method("taesd") + assert args.preview_method == LatentPreviewMethod.TAESD + + def test_override_with_latent2rgb(self): + """'latent2rgb' should set args.preview_method to Latent2RGB.""" + set_preview_method("latent2rgb") + assert args.preview_method == LatentPreviewMethod.Latent2RGB + + def test_override_with_auto(self): + """'auto' should set args.preview_method to Auto.""" + set_preview_method("auto") + assert args.preview_method == LatentPreviewMethod.Auto + + def test_override_with_none_value(self): + """'none' should set args.preview_method to NoPreviews.""" + set_preview_method("none") + assert args.preview_method == LatentPreviewMethod.NoPreviews + + def test_default_restores_original(self): + """'default' should restore to default_preview_method.""" + # First override to something else + set_preview_method("taesd") + assert args.preview_method == LatentPreviewMethod.TAESD + + # Then use 'default' to restore + set_preview_method("default") + assert args.preview_method == default_preview_method + + def test_none_param_restores_original(self): + """None parameter should restore to default_preview_method.""" + # First override to something else + set_preview_method("taesd") + assert args.preview_method == LatentPreviewMethod.TAESD + + # Then use None to restore + set_preview_method(None) + assert args.preview_method == default_preview_method + + def test_empty_string_restores_original(self): + """Empty string should restore to default_preview_method.""" + set_preview_method("taesd") + set_preview_method("") + assert args.preview_method == default_preview_method + + def test_invalid_value_restores_original(self): + """Invalid value should restore to default_preview_method.""" + set_preview_method("taesd") + set_preview_method("invalid_method") + assert args.preview_method == default_preview_method + + def test_case_sensitive_invalid_restores(self): + """Case-mismatched values should restore to default.""" + set_preview_method("taesd") + set_preview_method("TAESD") # Wrong case + assert args.preview_method == default_preview_method + + +class TestDefaultPreviewMethod: + """Test default_preview_method module variable.""" + + def test_default_is_not_none(self): + """default_preview_method should not be None.""" + assert default_preview_method is not None + + def test_default_is_enum_member(self): + """default_preview_method should be a LatentPreviewMethod enum.""" + assert isinstance(default_preview_method, LatentPreviewMethod) + + def test_default_matches_args_initial(self): + """default_preview_method should match CLI default or user setting.""" + # This tests that default_preview_method was captured at module load + # After set_preview_method(None), args should equal default + original = args.preview_method + set_preview_method("taesd") + set_preview_method(None) + assert args.preview_method == default_preview_method + args.preview_method = original + + +class TestArgsPreviewMethodModification: + """Test args.preview_method can be modified correctly.""" + + def setup_method(self): + """Store original value before each test.""" + self.original = args.preview_method + + def teardown_method(self): + """Restore original value after each test.""" + args.preview_method = self.original + + def test_args_accepts_all_enum_values(self): + """args.preview_method should accept all LatentPreviewMethod values.""" + for method in LatentPreviewMethod: + args.preview_method = method + assert args.preview_method == method + + def test_args_modification_and_restoration(self): + """args.preview_method should be modifiable and restorable.""" + original = args.preview_method + + args.preview_method = LatentPreviewMethod.TAESD + assert args.preview_method == LatentPreviewMethod.TAESD + + args.preview_method = original + assert args.preview_method == original + + +class TestExecutionFlow: + """Test the execution flow pattern used in execution.py.""" + + def setup_method(self): + """Store original value before each test.""" + self.original = args.preview_method + + def teardown_method(self): + """Restore original value after each test.""" + args.preview_method = self.original + + def test_sequential_executions_with_different_methods(self): + """Simulate multiple queue executions with different preview methods.""" + # Execution 1: taesd + set_preview_method("taesd") + assert args.preview_method == LatentPreviewMethod.TAESD + + # Execution 2: none + set_preview_method("none") + assert args.preview_method == LatentPreviewMethod.NoPreviews + + # Execution 3: default (restore) + set_preview_method("default") + assert args.preview_method == default_preview_method + + # Execution 4: auto + set_preview_method("auto") + assert args.preview_method == LatentPreviewMethod.Auto + + # Execution 5: no override (None) + set_preview_method(None) + assert args.preview_method == default_preview_method + + def test_override_then_default_pattern(self): + """Test the pattern: override -> execute -> next call restores.""" + # First execution with override + set_preview_method("latent2rgb") + assert args.preview_method == LatentPreviewMethod.Latent2RGB + + # Second execution without override restores default + set_preview_method(None) + assert args.preview_method == default_preview_method + + def test_extra_data_simulation(self): + """Simulate extra_data.get('preview_method') patterns.""" + # Simulate: extra_data = {"preview_method": "taesd"} + extra_data = {"preview_method": "taesd"} + set_preview_method(extra_data.get("preview_method")) + assert args.preview_method == LatentPreviewMethod.TAESD + + # Simulate: extra_data = {} + extra_data = {} + set_preview_method(extra_data.get("preview_method")) + assert args.preview_method == default_preview_method + + # Simulate: extra_data = {"preview_method": "default"} + extra_data = {"preview_method": "default"} + set_preview_method(extra_data.get("preview_method")) + assert args.preview_method == default_preview_method + + +class TestRealWorldScenarios: + """Tests using real-world prompt data patterns.""" + + def setup_method(self): + """Store original value before each test.""" + self.original = args.preview_method + + def teardown_method(self): + """Restore original value after each test.""" + args.preview_method = self.original + + def test_captured_prompt_without_preview_method(self): + """ + Test with captured prompt that has no preview_method. + Based on: tests-unit/execution_test/fixtures/default_prompt.json + """ + # Real captured extra_data structure (preview_method absent) + extra_data = { + "extra_pnginfo": {"workflow": {}}, + "client_id": "271314f0dabd48e5aaa488ed7a4ceb0d", + "create_time": 1765416558179 + } + + set_preview_method(extra_data.get("preview_method")) + assert args.preview_method == default_preview_method + + def test_captured_prompt_with_preview_method_taesd(self): + """Test captured prompt with preview_method: taesd.""" + extra_data = { + "extra_pnginfo": {"workflow": {}}, + "client_id": "271314f0dabd48e5aaa488ed7a4ceb0d", + "preview_method": "taesd" + } + + set_preview_method(extra_data.get("preview_method")) + assert args.preview_method == LatentPreviewMethod.TAESD + + def test_captured_prompt_with_preview_method_none(self): + """Test captured prompt with preview_method: none (disable preview).""" + extra_data = { + "extra_pnginfo": {"workflow": {}}, + "client_id": "test-client", + "preview_method": "none" + } + + set_preview_method(extra_data.get("preview_method")) + assert args.preview_method == LatentPreviewMethod.NoPreviews + + def test_captured_prompt_with_preview_method_latent2rgb(self): + """Test captured prompt with preview_method: latent2rgb.""" + extra_data = { + "extra_pnginfo": {"workflow": {}}, + "client_id": "test-client", + "preview_method": "latent2rgb" + } + + set_preview_method(extra_data.get("preview_method")) + assert args.preview_method == LatentPreviewMethod.Latent2RGB + + def test_captured_prompt_with_preview_method_auto(self): + """Test captured prompt with preview_method: auto.""" + extra_data = { + "extra_pnginfo": {"workflow": {}}, + "client_id": "test-client", + "preview_method": "auto" + } + + set_preview_method(extra_data.get("preview_method")) + assert args.preview_method == LatentPreviewMethod.Auto + + def test_captured_prompt_with_preview_method_default(self): + """Test captured prompt with preview_method: default (use CLI setting).""" + # First set to something else + set_preview_method("taesd") + assert args.preview_method == LatentPreviewMethod.TAESD + + # Then simulate a prompt with "default" + extra_data = { + "extra_pnginfo": {"workflow": {}}, + "client_id": "test-client", + "preview_method": "default" + } + + set_preview_method(extra_data.get("preview_method")) + assert args.preview_method == default_preview_method + + def test_sequential_queue_with_different_preview_methods(self): + """ + Simulate real queue scenario: multiple prompts with different settings. + This tests the actual usage pattern in ComfyUI. + """ + # Queue 1: User wants TAESD preview + extra_data_1 = {"client_id": "client-1", "preview_method": "taesd"} + set_preview_method(extra_data_1.get("preview_method")) + assert args.preview_method == LatentPreviewMethod.TAESD + + # Queue 2: User wants no preview (faster execution) + extra_data_2 = {"client_id": "client-2", "preview_method": "none"} + set_preview_method(extra_data_2.get("preview_method")) + assert args.preview_method == LatentPreviewMethod.NoPreviews + + # Queue 3: User doesn't specify (use server default) + extra_data_3 = {"client_id": "client-3"} + set_preview_method(extra_data_3.get("preview_method")) + assert args.preview_method == default_preview_method + + # Queue 4: User explicitly wants default + extra_data_4 = {"client_id": "client-4", "preview_method": "default"} + set_preview_method(extra_data_4.get("preview_method")) + assert args.preview_method == default_preview_method + + # Queue 5: User wants latent2rgb + extra_data_5 = {"client_id": "client-5", "preview_method": "latent2rgb"} + set_preview_method(extra_data_5.get("preview_method")) + assert args.preview_method == LatentPreviewMethod.Latent2RGB diff --git a/tests/execution/test_preview_method.py b/tests/execution/test_preview_method.py new file mode 100644 index 000000000..c3037553b --- /dev/null +++ b/tests/execution/test_preview_method.py @@ -0,0 +1,358 @@ +""" +E2E tests for Queue-specific Preview Method Override feature. + +Tests actual execution with different preview_method values. +Requires a running ComfyUI server with models. + +Usage: + COMFYUI_SERVER=http://localhost:8988 pytest test_preview_method_e2e.py -v -m preview_method + +Note: + These tests execute actual image generation and wait for completion. + Tests verify preview image transmission based on preview_method setting. +""" +import os +import json +import pytest +import uuid +import time +import random +import websocket +import urllib.request +from pathlib import Path + + +# Server configuration +SERVER_URL = os.environ.get("COMFYUI_SERVER", "http://localhost:8988") +SERVER_HOST = SERVER_URL.replace("http://", "").replace("https://", "") + +# Use existing inference graph fixture +GRAPH_FILE = Path(__file__).parent.parent / "inference" / "graphs" / "default_graph_sdxl1_0.json" + + +def is_server_running() -> bool: + """Check if ComfyUI server is running.""" + try: + request = urllib.request.Request(f"{SERVER_URL}/system_stats") + with urllib.request.urlopen(request, timeout=2.0): + return True + except Exception: + return False + + +def prepare_graph_for_test(graph: dict, steps: int = 5) -> dict: + """Prepare graph for testing: randomize seeds and reduce steps.""" + adapted = json.loads(json.dumps(graph)) # Deep copy + for node_id, node in adapted.items(): + inputs = node.get("inputs", {}) + # Handle both "seed" and "noise_seed" (used by KSamplerAdvanced) + if "seed" in inputs: + inputs["seed"] = random.randint(0, 2**32 - 1) + if "noise_seed" in inputs: + inputs["noise_seed"] = random.randint(0, 2**32 - 1) + # Reduce steps for faster testing (default 20 -> 5) + if "steps" in inputs: + inputs["steps"] = steps + return adapted + + +# Alias for backward compatibility +randomize_seed = prepare_graph_for_test + + +class PreviewMethodClient: + """Client for testing preview_method with WebSocket execution tracking.""" + + def __init__(self, server_address: str): + self.server_address = server_address + self.client_id = str(uuid.uuid4()) + self.ws = None + + def connect(self): + """Connect to WebSocket.""" + self.ws = websocket.WebSocket() + self.ws.settimeout(120) # 2 minute timeout for sampling + self.ws.connect(f"ws://{self.server_address}/ws?clientId={self.client_id}") + + def close(self): + """Close WebSocket connection.""" + if self.ws: + self.ws.close() + + def queue_prompt(self, prompt: dict, extra_data: dict = None) -> dict: + """Queue a prompt and return response with prompt_id.""" + data = { + "prompt": prompt, + "client_id": self.client_id, + "extra_data": extra_data or {} + } + req = urllib.request.Request( + f"http://{self.server_address}/prompt", + data=json.dumps(data).encode("utf-8"), + headers={"Content-Type": "application/json"} + ) + return json.loads(urllib.request.urlopen(req).read()) + + def wait_for_execution(self, prompt_id: str, timeout: float = 120.0) -> dict: + """ + Wait for execution to complete via WebSocket. + + Returns: + dict with keys: completed, error, preview_count, execution_time + """ + result = { + "completed": False, + "error": None, + "preview_count": 0, + "execution_time": 0.0 + } + + start_time = time.time() + self.ws.settimeout(timeout) + + try: + while True: + out = self.ws.recv() + elapsed = time.time() - start_time + + if isinstance(out, str): + message = json.loads(out) + msg_type = message.get("type") + data = message.get("data", {}) + + if data.get("prompt_id") != prompt_id: + continue + + if msg_type == "executing": + if data.get("node") is None: + # Execution complete + result["completed"] = True + result["execution_time"] = elapsed + break + + elif msg_type == "execution_error": + result["error"] = data + result["execution_time"] = elapsed + break + + elif msg_type == "progress": + # Progress update during sampling + pass + + elif isinstance(out, bytes): + # Binary data = preview image + result["preview_count"] += 1 + + except websocket.WebSocketTimeoutException: + result["error"] = "Timeout waiting for execution" + result["execution_time"] = time.time() - start_time + + return result + + +def load_graph() -> dict: + """Load the SDXL graph fixture with randomized seed.""" + with open(GRAPH_FILE) as f: + graph = json.load(f) + return randomize_seed(graph) # Avoid caching + + +# Skip all tests if server is not running +pytestmark = [ + pytest.mark.skipif( + not is_server_running(), + reason=f"ComfyUI server not running at {SERVER_URL}" + ), + pytest.mark.preview_method, + pytest.mark.execution, +] + + +@pytest.fixture +def client(): + """Create and connect a test client.""" + c = PreviewMethodClient(SERVER_HOST) + c.connect() + yield c + c.close() + + +@pytest.fixture +def graph(): + """Load the test graph.""" + return load_graph() + + +class TestPreviewMethodExecution: + """Test actual execution with different preview methods.""" + + def test_execution_with_latent2rgb(self, client, graph): + """ + Execute with preview_method=latent2rgb. + Should complete and potentially receive preview images. + """ + extra_data = {"preview_method": "latent2rgb"} + + response = client.queue_prompt(graph, extra_data) + assert "prompt_id" in response + + result = client.wait_for_execution(response["prompt_id"]) + + # Should complete (may error if model missing, but that's separate) + assert result["completed"] or result["error"] is not None + # Execution should take some time (sampling) + if result["completed"]: + assert result["execution_time"] > 0.5, "Execution too fast - likely didn't run" + # latent2rgb should produce previews + print(f"latent2rgb: {result['preview_count']} previews in {result['execution_time']:.2f}s") # noqa: T201 + + def test_execution_with_taesd(self, client, graph): + """ + Execute with preview_method=taesd. + TAESD provides higher quality previews. + """ + extra_data = {"preview_method": "taesd"} + + response = client.queue_prompt(graph, extra_data) + assert "prompt_id" in response + + result = client.wait_for_execution(response["prompt_id"]) + + assert result["completed"] or result["error"] is not None + if result["completed"]: + assert result["execution_time"] > 0.5 + # taesd should also produce previews + print(f"taesd: {result['preview_count']} previews in {result['execution_time']:.2f}s") # noqa: T201 + + def test_execution_with_none_preview(self, client, graph): + """ + Execute with preview_method=none. + No preview images should be generated. + """ + extra_data = {"preview_method": "none"} + + response = client.queue_prompt(graph, extra_data) + assert "prompt_id" in response + + result = client.wait_for_execution(response["prompt_id"]) + + assert result["completed"] or result["error"] is not None + if result["completed"]: + # With "none", should receive no preview images + assert result["preview_count"] == 0, \ + f"Expected no previews with 'none', got {result['preview_count']}" + print(f"none: {result['preview_count']} previews in {result['execution_time']:.2f}s") # noqa: T201 + + def test_execution_with_default(self, client, graph): + """ + Execute with preview_method=default. + Should use server's CLI default setting. + """ + extra_data = {"preview_method": "default"} + + response = client.queue_prompt(graph, extra_data) + assert "prompt_id" in response + + result = client.wait_for_execution(response["prompt_id"]) + + assert result["completed"] or result["error"] is not None + if result["completed"]: + print(f"default: {result['preview_count']} previews in {result['execution_time']:.2f}s") # noqa: T201 + + def test_execution_without_preview_method(self, client, graph): + """ + Execute without preview_method in extra_data. + Should use server's default preview method. + """ + extra_data = {} # No preview_method + + response = client.queue_prompt(graph, extra_data) + assert "prompt_id" in response + + result = client.wait_for_execution(response["prompt_id"]) + + assert result["completed"] or result["error"] is not None + if result["completed"]: + print(f"(no override): {result['preview_count']} previews in {result['execution_time']:.2f}s") # noqa: T201 + + +class TestPreviewMethodComparison: + """Compare preview behavior between different methods.""" + + def test_none_vs_latent2rgb_preview_count(self, client, graph): + """ + Compare preview counts: 'none' should have 0, others should have >0. + This is the key verification that preview_method actually works. + """ + results = {} + + # Run with none (randomize seed to avoid caching) + graph_none = randomize_seed(graph) + extra_data_none = {"preview_method": "none"} + response = client.queue_prompt(graph_none, extra_data_none) + results["none"] = client.wait_for_execution(response["prompt_id"]) + + # Run with latent2rgb (randomize seed again) + graph_rgb = randomize_seed(graph) + extra_data_rgb = {"preview_method": "latent2rgb"} + response = client.queue_prompt(graph_rgb, extra_data_rgb) + results["latent2rgb"] = client.wait_for_execution(response["prompt_id"]) + + # Verify both completed + assert results["none"]["completed"], f"'none' execution failed: {results['none']['error']}" + assert results["latent2rgb"]["completed"], f"'latent2rgb' execution failed: {results['latent2rgb']['error']}" + + # Key assertion: 'none' should have 0 previews + assert results["none"]["preview_count"] == 0, \ + f"'none' should have 0 previews, got {results['none']['preview_count']}" + + # 'latent2rgb' should have at least 1 preview (depends on steps) + assert results["latent2rgb"]["preview_count"] > 0, \ + f"'latent2rgb' should have >0 previews, got {results['latent2rgb']['preview_count']}" + + print("\nPreview count comparison:") # noqa: T201 + print(f" none: {results['none']['preview_count']} previews") # noqa: T201 + print(f" latent2rgb: {results['latent2rgb']['preview_count']} previews") # noqa: T201 + + +class TestPreviewMethodSequential: + """Test sequential execution with different preview methods.""" + + def test_sequential_different_methods(self, client, graph): + """ + Execute multiple prompts sequentially with different preview methods. + Each should complete independently with correct preview behavior. + """ + methods = ["latent2rgb", "none", "default"] + results = [] + + for method in methods: + # Randomize seed for each execution to avoid caching + graph_run = randomize_seed(graph) + extra_data = {"preview_method": method} + response = client.queue_prompt(graph_run, extra_data) + + result = client.wait_for_execution(response["prompt_id"]) + results.append({ + "method": method, + "completed": result["completed"], + "preview_count": result["preview_count"], + "execution_time": result["execution_time"], + "error": result["error"] + }) + + # All should complete or have clear errors + for r in results: + assert r["completed"] or r["error"] is not None, \ + f"Method {r['method']} neither completed nor errored" + + # "none" should have zero previews if completed + none_result = next(r for r in results if r["method"] == "none") + if none_result["completed"]: + assert none_result["preview_count"] == 0, \ + f"'none' should have 0 previews, got {none_result['preview_count']}" + + print("\nSequential execution results:") # noqa: T201 + for r in results: + status = "✓" if r["completed"] else f"✗ ({r['error']})" + print(f" {r['method']}: {status}, {r['preview_count']} previews, {r['execution_time']:.2f}s") # noqa: T201 From 43e0d4e3ccfe8b5eac81bcee6f912f661849aafb Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 16 Dec 2025 02:01:10 +0200 Subject: [PATCH 746/759] comfy_api: remove usage of "Type","List" and "Dict" types (#11238) --- comfy_api/feature_flags.py | 10 +++++----- comfy_api/internal/api_registry.py | 10 +++++----- comfy_api/internal/async_to_sync.py | 14 ++++++------- comfy_api/internal/singleton.py | 6 +++--- comfy_api/latest/__init__.py | 4 ++-- comfy_api/latest/_input/basic_types.py | 4 ++-- comfy_api/latest/_ui.py | 27 +++++++++++++------------- comfy_api/version_list.py | 3 +-- 8 files changed, 38 insertions(+), 40 deletions(-) diff --git a/comfy_api/feature_flags.py b/comfy_api/feature_flags.py index bfb77eb5f..de167f037 100644 --- a/comfy_api/feature_flags.py +++ b/comfy_api/feature_flags.py @@ -5,12 +5,12 @@ This module handles capability negotiation between frontend and backend, allowing graceful protocol evolution while maintaining backward compatibility. """ -from typing import Any, Dict +from typing import Any from comfy.cli_args import args # Default server capabilities -SERVER_FEATURE_FLAGS: Dict[str, Any] = { +SERVER_FEATURE_FLAGS: dict[str, Any] = { "supports_preview_metadata": True, "max_upload_size": args.max_upload_size * 1024 * 1024, # Convert MB to bytes "extension": {"manager": {"supports_v4": True}}, @@ -18,7 +18,7 @@ SERVER_FEATURE_FLAGS: Dict[str, Any] = { def get_connection_feature( - sockets_metadata: Dict[str, Dict[str, Any]], + sockets_metadata: dict[str, dict[str, Any]], sid: str, feature_name: str, default: Any = False @@ -42,7 +42,7 @@ def get_connection_feature( def supports_feature( - sockets_metadata: Dict[str, Dict[str, Any]], + sockets_metadata: dict[str, dict[str, Any]], sid: str, feature_name: str ) -> bool: @@ -60,7 +60,7 @@ def supports_feature( return get_connection_feature(sockets_metadata, sid, feature_name, False) is True -def get_server_features() -> Dict[str, Any]: +def get_server_features() -> dict[str, Any]: """ Get the server's feature flags. diff --git a/comfy_api/internal/api_registry.py b/comfy_api/internal/api_registry.py index 7e3375cf6..2b1cb016a 100644 --- a/comfy_api/internal/api_registry.py +++ b/comfy_api/internal/api_registry.py @@ -1,4 +1,4 @@ -from typing import Type, List, NamedTuple +from typing import NamedTuple from comfy_api.internal.singleton import ProxiedSingleton from packaging import version as packaging_version @@ -10,7 +10,7 @@ class ComfyAPIBase(ProxiedSingleton): class ComfyAPIWithVersion(NamedTuple): version: str - api_class: Type[ComfyAPIBase] + api_class: type[ComfyAPIBase] def parse_version(version_str: str) -> packaging_version.Version: @@ -23,16 +23,16 @@ def parse_version(version_str: str) -> packaging_version.Version: return packaging_version.parse(version_str) -registered_versions: List[ComfyAPIWithVersion] = [] +registered_versions: list[ComfyAPIWithVersion] = [] -def register_versions(versions: List[ComfyAPIWithVersion]): +def register_versions(versions: list[ComfyAPIWithVersion]): versions.sort(key=lambda x: parse_version(x.version)) global registered_versions registered_versions = versions -def get_all_versions() -> List[ComfyAPIWithVersion]: +def get_all_versions() -> list[ComfyAPIWithVersion]: """ Returns a list of all registered ComfyAPI versions. """ diff --git a/comfy_api/internal/async_to_sync.py b/comfy_api/internal/async_to_sync.py index 257ade82e..c9b0576e1 100644 --- a/comfy_api/internal/async_to_sync.py +++ b/comfy_api/internal/async_to_sync.py @@ -8,7 +8,7 @@ import os import textwrap import threading from enum import Enum -from typing import Optional, Type, get_origin, get_args, get_type_hints +from typing import Optional, get_origin, get_args, get_type_hints class TypeTracker: @@ -193,7 +193,7 @@ class AsyncToSyncConverter: return result_container["result"] @classmethod - def create_sync_class(cls, async_class: Type, thread_pool_size=10) -> Type: + def create_sync_class(cls, async_class: type, thread_pool_size=10) -> type: """ Creates a new class with synchronous versions of all async methods. @@ -563,7 +563,7 @@ class AsyncToSyncConverter: @classmethod def _generate_imports( - cls, async_class: Type, type_tracker: TypeTracker + cls, async_class: type, type_tracker: TypeTracker ) -> list[str]: """Generate import statements for the stub file.""" imports = [] @@ -628,7 +628,7 @@ class AsyncToSyncConverter: return imports @classmethod - def _get_class_attributes(cls, async_class: Type) -> list[tuple[str, Type]]: + def _get_class_attributes(cls, async_class: type) -> list[tuple[str, type]]: """Extract class attributes that are classes themselves.""" class_attributes = [] @@ -654,7 +654,7 @@ class AsyncToSyncConverter: def _generate_inner_class_stub( cls, name: str, - attr: Type, + attr: type, indent: str = " ", type_tracker: Optional[TypeTracker] = None, ) -> list[str]: @@ -782,7 +782,7 @@ class AsyncToSyncConverter: return processed @classmethod - def generate_stub_file(cls, async_class: Type, sync_class: Type) -> None: + def generate_stub_file(cls, async_class: type, sync_class: type) -> None: """ Generate a .pyi stub file for the sync class to help IDEs with type checking. """ @@ -988,7 +988,7 @@ class AsyncToSyncConverter: logging.error(traceback.format_exc()) -def create_sync_class(async_class: Type, thread_pool_size=10) -> Type: +def create_sync_class(async_class: type, thread_pool_size=10) -> type: """ Creates a sync version of an async class diff --git a/comfy_api/internal/singleton.py b/comfy_api/internal/singleton.py index 75f16f98e..d89380262 100644 --- a/comfy_api/internal/singleton.py +++ b/comfy_api/internal/singleton.py @@ -1,4 +1,4 @@ -from typing import Type, TypeVar +from typing import TypeVar class SingletonMetaclass(type): T = TypeVar("T", bound="SingletonMetaclass") @@ -11,13 +11,13 @@ class SingletonMetaclass(type): ) return cls._instances[cls] - def inject_instance(cls: Type[T], instance: T) -> None: + def inject_instance(cls: type[T], instance: T) -> None: assert cls not in SingletonMetaclass._instances, ( "Cannot inject instance after first instantiation" ) SingletonMetaclass._instances[cls] = instance - def get_instance(cls: Type[T], *args, **kwargs) -> T: + def get_instance(cls: type[T], *args, **kwargs) -> T: """ Gets the singleton instance of the class, creating it if it doesn't exist. """ diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py index 35e1ac853..fab63c7df 100644 --- a/comfy_api/latest/__init__.py +++ b/comfy_api/latest/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Type, TYPE_CHECKING +from typing import TYPE_CHECKING from comfy_api.internal import ComfyAPIBase from comfy_api.internal.singleton import ProxiedSingleton from comfy_api.internal.async_to_sync import create_sync_class @@ -113,7 +113,7 @@ ComfyAPI = ComfyAPI_latest if TYPE_CHECKING: import comfy_api.latest.generated.ComfyAPISyncStub # type: ignore - ComfyAPISync: Type[comfy_api.latest.generated.ComfyAPISyncStub.ComfyAPISyncStub] + ComfyAPISync: type[comfy_api.latest.generated.ComfyAPISyncStub.ComfyAPISyncStub] ComfyAPISync = create_sync_class(ComfyAPI_latest) # create new aliases for io and ui diff --git a/comfy_api/latest/_input/basic_types.py b/comfy_api/latest/_input/basic_types.py index 245c6cbb1..d73deabd2 100644 --- a/comfy_api/latest/_input/basic_types.py +++ b/comfy_api/latest/_input/basic_types.py @@ -1,5 +1,5 @@ import torch -from typing import TypedDict, List, Optional +from typing import TypedDict, Optional ImageInput = torch.Tensor """ @@ -39,4 +39,4 @@ class LatentInput(TypedDict): Optional noise mask tensor in the same format as samples. """ - batch_index: Optional[List[int]] + batch_index: Optional[list[int]] diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py index 2babe209a..e238cdf3c 100644 --- a/comfy_api/latest/_ui.py +++ b/comfy_api/latest/_ui.py @@ -5,7 +5,6 @@ import os import random import uuid from io import BytesIO -from typing import Type import av import numpy as np @@ -83,7 +82,7 @@ class ImageSaveHelper: return PILImage.fromarray(np.clip(255.0 * image_tensor.cpu().numpy(), 0, 255).astype(np.uint8)) @staticmethod - def _create_png_metadata(cls: Type[ComfyNode] | None) -> PngInfo | None: + def _create_png_metadata(cls: type[ComfyNode] | None) -> PngInfo | None: """Creates a PngInfo object with prompt and extra_pnginfo.""" if args.disable_metadata or cls is None or not cls.hidden: return None @@ -96,7 +95,7 @@ class ImageSaveHelper: return metadata @staticmethod - def _create_animated_png_metadata(cls: Type[ComfyNode] | None) -> PngInfo | None: + def _create_animated_png_metadata(cls: type[ComfyNode] | None) -> PngInfo | None: """Creates a PngInfo object with prompt and extra_pnginfo for animated PNGs (APNG).""" if args.disable_metadata or cls is None or not cls.hidden: return None @@ -121,7 +120,7 @@ class ImageSaveHelper: return metadata @staticmethod - def _create_webp_metadata(pil_image: PILImage.Image, cls: Type[ComfyNode] | None) -> PILImage.Exif: + def _create_webp_metadata(pil_image: PILImage.Image, cls: type[ComfyNode] | None) -> PILImage.Exif: """Creates EXIF metadata bytes for WebP images.""" exif_data = pil_image.getexif() if args.disable_metadata or cls is None or cls.hidden is None: @@ -137,7 +136,7 @@ class ImageSaveHelper: @staticmethod def save_images( - images, filename_prefix: str, folder_type: FolderType, cls: Type[ComfyNode] | None, compress_level = 4, + images, filename_prefix: str, folder_type: FolderType, cls: type[ComfyNode] | None, compress_level = 4, ) -> list[SavedResult]: """Saves a batch of images as individual PNG files.""" full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path( @@ -155,7 +154,7 @@ class ImageSaveHelper: return results @staticmethod - def get_save_images_ui(images, filename_prefix: str, cls: Type[ComfyNode] | None, compress_level=4) -> SavedImages: + def get_save_images_ui(images, filename_prefix: str, cls: type[ComfyNode] | None, compress_level=4) -> SavedImages: """Saves a batch of images and returns a UI object for the node output.""" return SavedImages( ImageSaveHelper.save_images( @@ -169,7 +168,7 @@ class ImageSaveHelper: @staticmethod def save_animated_png( - images, filename_prefix: str, folder_type: FolderType, cls: Type[ComfyNode] | None, fps: float, compress_level: int + images, filename_prefix: str, folder_type: FolderType, cls: type[ComfyNode] | None, fps: float, compress_level: int ) -> SavedResult: """Saves a batch of images as a single animated PNG.""" full_output_folder, filename, counter, subfolder, _ = folder_paths.get_save_image_path( @@ -191,7 +190,7 @@ class ImageSaveHelper: @staticmethod def get_save_animated_png_ui( - images, filename_prefix: str, cls: Type[ComfyNode] | None, fps: float, compress_level: int + images, filename_prefix: str, cls: type[ComfyNode] | None, fps: float, compress_level: int ) -> SavedImages: """Saves an animated PNG and returns a UI object for the node output.""" result = ImageSaveHelper.save_animated_png( @@ -209,7 +208,7 @@ class ImageSaveHelper: images, filename_prefix: str, folder_type: FolderType, - cls: Type[ComfyNode] | None, + cls: type[ComfyNode] | None, fps: float, lossless: bool, quality: int, @@ -238,7 +237,7 @@ class ImageSaveHelper: def get_save_animated_webp_ui( images, filename_prefix: str, - cls: Type[ComfyNode] | None, + cls: type[ComfyNode] | None, fps: float, lossless: bool, quality: int, @@ -267,7 +266,7 @@ class AudioSaveHelper: audio: dict, filename_prefix: str, folder_type: FolderType, - cls: Type[ComfyNode] | None, + cls: type[ComfyNode] | None, format: str = "flac", quality: str = "128k", ) -> list[SavedResult]: @@ -372,7 +371,7 @@ class AudioSaveHelper: @staticmethod def get_save_audio_ui( - audio, filename_prefix: str, cls: Type[ComfyNode] | None, format: str = "flac", quality: str = "128k", + audio, filename_prefix: str, cls: type[ComfyNode] | None, format: str = "flac", quality: str = "128k", ) -> SavedAudios: """Save and instantly wrap for UI.""" return SavedAudios( @@ -388,7 +387,7 @@ class AudioSaveHelper: class PreviewImage(_UIOutput): - def __init__(self, image: Image.Type, animated: bool = False, cls: Type[ComfyNode] = None, **kwargs): + def __init__(self, image: Image.Type, animated: bool = False, cls: type[ComfyNode] = None, **kwargs): self.values = ImageSaveHelper.save_images( image, filename_prefix="ComfyUI_temp_" + ''.join(random.choice("abcdefghijklmnopqrstupvxyz") for _ in range(5)), @@ -412,7 +411,7 @@ class PreviewMask(PreviewImage): class PreviewAudio(_UIOutput): - def __init__(self, audio: dict, cls: Type[ComfyNode] = None, **kwargs): + def __init__(self, audio: dict, cls: type[ComfyNode] = None, **kwargs): self.values = AudioSaveHelper.save_audio( audio, filename_prefix="ComfyUI_temp_" + "".join(random.choice("abcdefghijklmnopqrstuvwxyz") for _ in range(5)), diff --git a/comfy_api/version_list.py b/comfy_api/version_list.py index 7cb1871d5..be6e1db66 100644 --- a/comfy_api/version_list.py +++ b/comfy_api/version_list.py @@ -2,9 +2,8 @@ from comfy_api.latest import ComfyAPI_latest from comfy_api.v0_0_2 import ComfyAPIAdapter_v0_0_2 from comfy_api.v0_0_1 import ComfyAPIAdapter_v0_0_1 from comfy_api.internal import ComfyAPIBase -from typing import List, Type -supported_versions: List[Type[ComfyAPIBase]] = [ +supported_versions: list[type[ComfyAPIBase]] = [ ComfyAPI_latest, ComfyAPIAdapter_v0_0_2, ComfyAPIAdapter_v0_0_1, From 77b2f7c228a0db6643bb7f29be4db0bff6799db2 Mon Sep 17 00:00:00 2001 From: drozbay <17261091+drozbay@users.noreply.github.com> Date: Mon, 15 Dec 2025 17:06:32 -0700 Subject: [PATCH 747/759] Add context windows callback for custom cond handling (#11208) Co-authored-by: ozbayb <17261091+ozbayb@users.noreply.github.com> --- comfy/context_windows.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/comfy/context_windows.py b/comfy/context_windows.py index 5c412d1c2..2979b3ca1 100644 --- a/comfy/context_windows.py +++ b/comfy/context_windows.py @@ -87,6 +87,7 @@ class IndexListCallbacks: COMBINE_CONTEXT_WINDOW_RESULTS = "combine_context_window_results" EXECUTE_START = "execute_start" EXECUTE_CLEANUP = "execute_cleanup" + RESIZE_COND_ITEM = "resize_cond_item" def init_callbacks(self): return {} @@ -166,6 +167,18 @@ class IndexListContextHandler(ContextHandlerABC): new_cond_item = cond_item.copy() # when in dictionary, look for tensors and CONDCrossAttn [comfy/conds.py] (has cond attr that is a tensor) for cond_key, cond_value in new_cond_item.items(): + # Allow callbacks to handle custom conditioning items + handled = False + for callback in comfy.patcher_extension.get_all_callbacks( + IndexListCallbacks.RESIZE_COND_ITEM, self.callbacks + ): + result = callback(cond_key, cond_value, window, x_in, device, new_cond_item) + if result is not None: + new_cond_item[cond_key] = result + handled = True + break + if handled: + continue if isinstance(cond_value, torch.Tensor): if (self.dim < cond_value.ndim and cond_value(self.dim) == x_in.size(self.dim)) or \ (cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim)): From 70541d4e7769c6c40eae6594e677355eacd181fe Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 15 Dec 2025 16:20:34 -0800 Subject: [PATCH 748/759] Support the new qwen edit 2511 reference method. (#11340) index_timestep_zero can be selected in the FluxKontextMultiReferenceLatentMethod now with the display name set to the more generic "Edit Model Reference Method" node. --- comfy/ldm/qwen_image/model.py | 47 +++++++++++++++++++++++++++++------ comfy_extras/nodes_flux.py | 3 ++- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 8c75670cd..96590088b 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -218,9 +218,24 @@ class QwenImageTransformerBlock(nn.Module): operations=operations, ) - def _modulate(self, x: torch.Tensor, mod_params: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def _apply_gate(self, x, y, gate, timestep_zero_index=None): + if timestep_zero_index is not None: + return y + torch.cat((x[:, :timestep_zero_index] * gate[0], x[:, timestep_zero_index:] * gate[1]), dim=1) + else: + return torch.addcmul(y, gate, x) + + def _modulate(self, x: torch.Tensor, mod_params: torch.Tensor, timestep_zero_index=None) -> Tuple[torch.Tensor, torch.Tensor]: shift, scale, gate = torch.chunk(mod_params, 3, dim=-1) - return torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1)), gate.unsqueeze(1) + if timestep_zero_index is not None: + actual_batch = shift.size(0) // 2 + shift, shift_0 = shift[:actual_batch], shift[actual_batch:] + scale, scale_0 = scale[:actual_batch], scale[actual_batch:] + gate, gate_0 = gate[:actual_batch], gate[actual_batch:] + reg = torch.addcmul(shift.unsqueeze(1), x[:, :timestep_zero_index], 1 + scale.unsqueeze(1)) + zero = torch.addcmul(shift_0.unsqueeze(1), x[:, timestep_zero_index:], 1 + scale_0.unsqueeze(1)) + return torch.cat((reg, zero), dim=1), (gate.unsqueeze(1), gate_0.unsqueeze(1)) + else: + return torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1)), gate.unsqueeze(1) def forward( self, @@ -229,14 +244,19 @@ class QwenImageTransformerBlock(nn.Module): encoder_hidden_states_mask: torch.Tensor, temb: torch.Tensor, image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + timestep_zero_index=None, transformer_options={}, ) -> Tuple[torch.Tensor, torch.Tensor]: img_mod_params = self.img_mod(temb) + + if timestep_zero_index is not None: + temb = temb.chunk(2, dim=0)[0] + txt_mod_params = self.txt_mod(temb) img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1) txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1) - img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1) + img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1, timestep_zero_index) del img_mod1 txt_modulated, txt_gate1 = self._modulate(self.txt_norm1(encoder_hidden_states), txt_mod1) del txt_mod1 @@ -251,15 +271,15 @@ class QwenImageTransformerBlock(nn.Module): del img_modulated del txt_modulated - hidden_states = hidden_states + img_gate1 * img_attn_output + hidden_states = self._apply_gate(img_attn_output, hidden_states, img_gate1, timestep_zero_index) encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output del img_attn_output del txt_attn_output del img_gate1 del txt_gate1 - img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2) - hidden_states = torch.addcmul(hidden_states, img_gate2, self.img_mlp(img_modulated2)) + img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2, timestep_zero_index) + hidden_states = self._apply_gate(self.img_mlp(img_modulated2), hidden_states, img_gate2, timestep_zero_index) txt_modulated2, txt_gate2 = self._modulate(self.txt_norm2(encoder_hidden_states), txt_mod2) encoder_hidden_states = torch.addcmul(encoder_hidden_states, txt_gate2, self.txt_mlp(txt_modulated2)) @@ -391,11 +411,14 @@ class QwenImageTransformer2DModel(nn.Module): hidden_states, img_ids, orig_shape = self.process_img(x) num_embeds = hidden_states.shape[1] + timestep_zero_index = None if ref_latents is not None: h = 0 w = 0 index = 0 - index_ref_method = kwargs.get("ref_latents_method", "index") == "index" + ref_method = kwargs.get("ref_latents_method", "index") + index_ref_method = (ref_method == "index") or (ref_method == "index_timestep_zero") + timestep_zero = ref_method == "index_timestep_zero" for ref in ref_latents: if index_ref_method: index += 1 @@ -415,6 +438,10 @@ class QwenImageTransformer2DModel(nn.Module): kontext, kontext_ids, _ = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset) hidden_states = torch.cat([hidden_states, kontext], dim=1) img_ids = torch.cat([img_ids, kontext_ids], dim=1) + if timestep_zero: + if index > 0: + timestep = torch.cat([timestep, timestep * 0], dim=0) + timestep_zero_index = num_embeds txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) @@ -446,7 +473,7 @@ class QwenImageTransformer2DModel(nn.Module): if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} - out["txt"], out["img"] = block(hidden_states=args["img"], encoder_hidden_states=args["txt"], encoder_hidden_states_mask=encoder_hidden_states_mask, temb=args["vec"], image_rotary_emb=args["pe"], transformer_options=args["transformer_options"]) + out["txt"], out["img"] = block(hidden_states=args["img"], encoder_hidden_states=args["txt"], encoder_hidden_states_mask=encoder_hidden_states_mask, temb=args["vec"], image_rotary_emb=args["pe"], timestep_zero_index=timestep_zero_index, transformer_options=args["transformer_options"]) return out out = blocks_replace[("double_block", i)]({"img": hidden_states, "txt": encoder_hidden_states, "vec": temb, "pe": image_rotary_emb, "transformer_options": transformer_options}, {"original_block": block_wrap}) hidden_states = out["img"] @@ -458,6 +485,7 @@ class QwenImageTransformer2DModel(nn.Module): encoder_hidden_states_mask=encoder_hidden_states_mask, temb=temb, image_rotary_emb=image_rotary_emb, + timestep_zero_index=timestep_zero_index, transformer_options=transformer_options, ) @@ -474,6 +502,9 @@ class QwenImageTransformer2DModel(nn.Module): if add is not None: hidden_states[:, :add.shape[1]] += add + if timestep_zero_index is not None: + temb = temb.chunk(2, dim=0)[0] + hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index d9c4bba81..12c8ed3e6 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -154,12 +154,13 @@ class FluxKontextMultiReferenceLatentMethod(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="FluxKontextMultiReferenceLatentMethod", + display_name="Edit Model Reference Method", category="advanced/conditioning/flux", inputs=[ io.Conditioning.Input("conditioning"), io.Combo.Input( "reference_latents_method", - options=["offset", "index", "uxo/uno"], + options=["offset", "index", "uxo/uno", "index_timestep_zero"], ), ], outputs=[ From d02d0e5744f2e06fc40834d3c5bb387de4532007 Mon Sep 17 00:00:00 2001 From: seed93 Date: Tue, 16 Dec 2025 09:38:46 +0800 Subject: [PATCH 749/759] [add] tripo3.0 (#10663) * [add] tripo3.0 * [tripo] change paramter order * change order --------- Co-authored-by: liangd --- comfy_api_nodes/apis/tripo_api.py | 46 ++++++++++++++--- comfy_api_nodes/nodes_tripo.py | 86 ++++++++++++++++++++++++++++++- 2 files changed, 122 insertions(+), 10 deletions(-) diff --git a/comfy_api_nodes/apis/tripo_api.py b/comfy_api_nodes/apis/tripo_api.py index 713260e2a..ffaaa7dc1 100644 --- a/comfy_api_nodes/apis/tripo_api.py +++ b/comfy_api_nodes/apis/tripo_api.py @@ -5,11 +5,17 @@ from typing import Optional, List, Dict, Any, Union from pydantic import BaseModel, Field, RootModel class TripoModelVersion(str, Enum): + v3_0_20250812 = 'v3.0-20250812' v2_5_20250123 = 'v2.5-20250123' v2_0_20240919 = 'v2.0-20240919' v1_4_20240625 = 'v1.4-20240625' +class TripoGeometryQuality(str, Enum): + standard = 'standard' + detailed = 'detailed' + + class TripoTextureQuality(str, Enum): standard = 'standard' detailed = 'detailed' @@ -61,14 +67,20 @@ class TripoSpec(str, Enum): class TripoAnimation(str, Enum): IDLE = "preset:idle" WALK = "preset:walk" + RUN = "preset:run" + DIVE = "preset:dive" CLIMB = "preset:climb" JUMP = "preset:jump" - RUN = "preset:run" SLASH = "preset:slash" SHOOT = "preset:shoot" HURT = "preset:hurt" FALL = "preset:fall" TURN = "preset:turn" + QUADRUPED_WALK = "preset:quadruped:walk" + HEXAPOD_WALK = "preset:hexapod:walk" + OCTOPOD_WALK = "preset:octopod:walk" + SERPENTINE_MARCH = "preset:serpentine:march" + AQUATIC_MARCH = "preset:aquatic:march" class TripoStylizeStyle(str, Enum): LEGO = "lego" @@ -105,6 +117,11 @@ class TripoTaskStatus(str, Enum): BANNED = "banned" EXPIRED = "expired" +class TripoFbxPreset(str, Enum): + BLENDER = "blender" + MIXAMO = "mixamo" + _3DSMAX = "3dsmax" + class TripoFileTokenReference(BaseModel): type: Optional[str] = Field(None, description='The type of the reference') file_token: str @@ -142,6 +159,7 @@ class TripoTextToModelRequest(BaseModel): model_seed: Optional[int] = Field(None, description='The seed for the model') texture_seed: Optional[int] = Field(None, description='The seed for the texture') texture_quality: Optional[TripoTextureQuality] = TripoTextureQuality.standard + geometry_quality: Optional[TripoGeometryQuality] = TripoGeometryQuality.standard style: Optional[TripoStyle] = None auto_size: Optional[bool] = Field(False, description='Whether to auto-size the model') quad: Optional[bool] = Field(False, description='Whether to apply quad to the generated model') @@ -156,6 +174,7 @@ class TripoImageToModelRequest(BaseModel): model_seed: Optional[int] = Field(None, description='The seed for the model') texture_seed: Optional[int] = Field(None, description='The seed for the texture') texture_quality: Optional[TripoTextureQuality] = TripoTextureQuality.standard + geometry_quality: Optional[TripoGeometryQuality] = TripoGeometryQuality.standard texture_alignment: Optional[TripoTextureAlignment] = Field(TripoTextureAlignment.ORIGINAL_IMAGE, description='The texture alignment method') style: Optional[TripoStyle] = Field(None, description='The style to apply to the generated model') auto_size: Optional[bool] = Field(False, description='Whether to auto-size the model') @@ -173,6 +192,7 @@ class TripoMultiviewToModelRequest(BaseModel): model_seed: Optional[int] = Field(None, description='The seed for the model') texture_seed: Optional[int] = Field(None, description='The seed for the texture') texture_quality: Optional[TripoTextureQuality] = TripoTextureQuality.standard + geometry_quality: Optional[TripoGeometryQuality] = TripoGeometryQuality.standard texture_alignment: Optional[TripoTextureAlignment] = TripoTextureAlignment.ORIGINAL_IMAGE auto_size: Optional[bool] = Field(False, description='Whether to auto-size the model') orientation: Optional[TripoOrientation] = Field(TripoOrientation.DEFAULT, description='The orientation for the model') @@ -219,14 +239,24 @@ class TripoConvertModelRequest(BaseModel): type: TripoTaskType = Field(TripoTaskType.CONVERT_MODEL, description='Type of task') format: TripoConvertFormat = Field(..., description='The format to convert to') original_model_task_id: str = Field(..., description='The task ID of the original model') - quad: Optional[bool] = Field(False, description='Whether to apply quad to the model') - force_symmetry: Optional[bool] = Field(False, description='Whether to force symmetry') - face_limit: Optional[int] = Field(10000, description='The number of faces to limit the conversion to') - flatten_bottom: Optional[bool] = Field(False, description='Whether to flatten the bottom of the model') - flatten_bottom_threshold: Optional[float] = Field(0.01, description='The threshold for flattening the bottom') - texture_size: Optional[int] = Field(4096, description='The size of the texture') + quad: Optional[bool] = Field(None, description='Whether to apply quad to the model') + force_symmetry: Optional[bool] = Field(None, description='Whether to force symmetry') + face_limit: Optional[int] = Field(None, description='The number of faces to limit the conversion to') + flatten_bottom: Optional[bool] = Field(None, description='Whether to flatten the bottom of the model') + flatten_bottom_threshold: Optional[float] = Field(None, description='The threshold for flattening the bottom') + texture_size: Optional[int] = Field(None, description='The size of the texture') texture_format: Optional[TripoTextureFormat] = Field(TripoTextureFormat.JPEG, description='The format of the texture') - pivot_to_center_bottom: Optional[bool] = Field(False, description='Whether to pivot to the center bottom') + pivot_to_center_bottom: Optional[bool] = Field(None, description='Whether to pivot to the center bottom') + scale_factor: Optional[float] = Field(None, description='The scale factor for the model') + with_animation: Optional[bool] = Field(None, description='Whether to include animations') + pack_uv: Optional[bool] = Field(None, description='Whether to pack the UVs') + bake: Optional[bool] = Field(None, description='Whether to bake the model') + part_names: Optional[List[str]] = Field(None, description='The names of the parts to include') + fbx_preset: Optional[TripoFbxPreset] = Field(None, description='The preset for the FBX export') + export_vertex_colors: Optional[bool] = Field(None, description='Whether to export the vertex colors') + export_orientation: Optional[TripoOrientation] = Field(None, description='The orientation for the export') + animate_in_place: Optional[bool] = Field(None, description='Whether to animate in place') + class TripoTaskRequest(RootModel): root: Union[ diff --git a/comfy_api_nodes/nodes_tripo.py b/comfy_api_nodes/nodes_tripo.py index 697100ff2..bd3c24fb3 100644 --- a/comfy_api_nodes/nodes_tripo.py +++ b/comfy_api_nodes/nodes_tripo.py @@ -102,8 +102,9 @@ class TripoTextToModelNode(IO.ComfyNode): IO.Int.Input("model_seed", default=42, optional=True), IO.Int.Input("texture_seed", default=42, optional=True), IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True), - IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True), + IO.Int.Input("face_limit", default=-1, min=-1, max=2000000, optional=True), IO.Boolean.Input("quad", default=False, optional=True), + IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True), ], outputs=[ IO.String.Output(display_name="model_file"), @@ -131,6 +132,7 @@ class TripoTextToModelNode(IO.ComfyNode): model_seed: Optional[int] = None, texture_seed: Optional[int] = None, texture_quality: Optional[str] = None, + geometry_quality: Optional[str] = None, face_limit: Optional[int] = None, quad: Optional[bool] = None, ) -> IO.NodeOutput: @@ -154,6 +156,7 @@ class TripoTextToModelNode(IO.ComfyNode): texture_seed=texture_seed, texture_quality=texture_quality, face_limit=face_limit, + geometry_quality=geometry_quality, auto_size=True, quad=quad, ), @@ -194,6 +197,7 @@ class TripoImageToModelNode(IO.ComfyNode): ), IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True), IO.Boolean.Input("quad", default=False, optional=True), + IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True), ], outputs=[ IO.String.Output(display_name="model_file"), @@ -220,6 +224,7 @@ class TripoImageToModelNode(IO.ComfyNode): orientation=None, texture_seed: Optional[int] = None, texture_quality: Optional[str] = None, + geometry_quality: Optional[str] = None, texture_alignment: Optional[str] = None, face_limit: Optional[int] = None, quad: Optional[bool] = None, @@ -246,6 +251,7 @@ class TripoImageToModelNode(IO.ComfyNode): pbr=pbr, model_seed=model_seed, orientation=orientation, + geometry_quality=geometry_quality, texture_alignment=texture_alignment, texture_seed=texture_seed, texture_quality=texture_quality, @@ -295,6 +301,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode): ), IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True), IO.Boolean.Input("quad", default=False, optional=True), + IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True), ], outputs=[ IO.String.Output(display_name="model_file"), @@ -323,6 +330,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode): model_seed: Optional[int] = None, texture_seed: Optional[int] = None, texture_quality: Optional[str] = None, + geometry_quality: Optional[str] = None, texture_alignment: Optional[str] = None, face_limit: Optional[int] = None, quad: Optional[bool] = None, @@ -359,6 +367,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode): model_seed=model_seed, texture_seed=texture_seed, texture_quality=texture_quality, + geometry_quality=geometry_quality, texture_alignment=texture_alignment, face_limit=face_limit, quad=quad, @@ -508,6 +517,8 @@ class TripoRetargetNode(IO.ComfyNode): options=[ "preset:idle", "preset:walk", + "preset:run", + "preset:dive", "preset:climb", "preset:jump", "preset:slash", @@ -515,6 +526,11 @@ class TripoRetargetNode(IO.ComfyNode): "preset:hurt", "preset:fall", "preset:turn", + "preset:quadruped:walk", + "preset:hexapod:walk", + "preset:octopod:walk", + "preset:serpentine:march", + "preset:aquatic:march" ], ), ], @@ -563,7 +579,7 @@ class TripoConversionNode(IO.ComfyNode): "face_limit", default=-1, min=-1, - max=500000, + max=2000000, optional=True, ), IO.Int.Input( @@ -579,6 +595,40 @@ class TripoConversionNode(IO.ComfyNode): default="JPEG", optional=True, ), + IO.Boolean.Input("force_symmetry", default=False, optional=True), + IO.Boolean.Input("flatten_bottom", default=False, optional=True), + IO.Float.Input( + "flatten_bottom_threshold", + default=0.0, + min=0.0, + max=1.0, + optional=True, + ), + IO.Boolean.Input("pivot_to_center_bottom", default=False, optional=True), + IO.Float.Input( + "scale_factor", + default=1.0, + min=0.0, + optional=True, + ), + IO.Boolean.Input("with_animation", default=False, optional=True), + IO.Boolean.Input("pack_uv", default=False, optional=True), + IO.Boolean.Input("bake", default=False, optional=True), + IO.String.Input("part_names", default="", optional=True), # comma-separated list + IO.Combo.Input( + "fbx_preset", + options=["blender", "mixamo", "3dsmax"], + default="blender", + optional=True, + ), + IO.Boolean.Input("export_vertex_colors", default=False, optional=True), + IO.Combo.Input( + "export_orientation", + options=["align_image", "default"], + default="default", + optional=True, + ), + IO.Boolean.Input("animate_in_place", default=False, optional=True), ], outputs=[], hidden=[ @@ -604,12 +654,31 @@ class TripoConversionNode(IO.ComfyNode): original_model_task_id, format: str, quad: bool, + force_symmetry: bool, face_limit: int, + flatten_bottom: bool, + flatten_bottom_threshold: float, texture_size: int, texture_format: str, + pivot_to_center_bottom: bool, + scale_factor: float, + with_animation: bool, + pack_uv: bool, + bake: bool, + part_names: str, + fbx_preset: str, + export_vertex_colors: bool, + export_orientation: str, + animate_in_place: bool, ) -> IO.NodeOutput: if not original_model_task_id: raise RuntimeError("original_model_task_id is required") + + # Parse part_names from comma-separated string to list + part_names_list = None + if part_names and part_names.strip(): + part_names_list = [name.strip() for name in part_names.split(',') if name.strip()] + response = await sync_op( cls, endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), @@ -618,9 +687,22 @@ class TripoConversionNode(IO.ComfyNode): original_model_task_id=original_model_task_id, format=format, quad=quad if quad else None, + force_symmetry=force_symmetry if force_symmetry else None, face_limit=face_limit if face_limit != -1 else None, + flatten_bottom=flatten_bottom if flatten_bottom else None, + flatten_bottom_threshold=flatten_bottom_threshold if flatten_bottom_threshold != 0.0 else None, texture_size=texture_size if texture_size != 4096 else None, texture_format=texture_format if texture_format != "JPEG" else None, + pivot_to_center_bottom=pivot_to_center_bottom if pivot_to_center_bottom else None, + scale_factor=scale_factor if scale_factor != 1.0 else None, + with_animation=with_animation if with_animation else None, + pack_uv=pack_uv if pack_uv else None, + bake=bake if bake else None, + part_names=part_names_list, + fbx_preset=fbx_preset if fbx_preset != "blender" else None, + export_vertex_colors=export_vertex_colors if export_vertex_colors else None, + export_orientation=export_orientation if export_orientation != "default" else None, + animate_in_place=animate_in_place if animate_in_place else None, ), ) return await poll_until_finished(cls, response, average_duration=30) From 41bcf0619db87d443d468c9ddad4454bdbc1b084 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 15 Dec 2025 17:51:06 -0800 Subject: [PATCH 750/759] Add code to detect if a z image fun controlnet is broken or not. (#11341) --- comfy_extras/nodes_model_patch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index ec0e790dc..fdd5d0d3f 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -248,7 +248,10 @@ class ModelPatchLoader: config['n_control_layers'] = 15 config['additional_in_dim'] = 17 config['refiner_control'] = True - config['broken'] = True + ref_weight = sd.get("control_noise_refiner.0.after_proj.weight", None) + if ref_weight is not None: + if torch.count_nonzero(ref_weight) == 0: + config['broken'] = True model = comfy.ldm.lumina.controlnet.ZImage_Control(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast, **config) model.load_state_dict(sd) From fc4af8606880be0374cf1f1f45bc5730e6d22bf5 Mon Sep 17 00:00:00 2001 From: Haoming <73768377+Haoming02@users.noreply.github.com> Date: Tue, 16 Dec 2025 09:57:28 +0800 Subject: [PATCH 751/759] [BlockInfo] Lumina (#11227) * block info * device * Make tensor int again --------- Co-authored-by: Jedrzej Kosinski --- comfy/ldm/lumina/model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index 96cb37fa6..5628e2ba3 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -634,8 +634,11 @@ class NextDiT(nn.Module): img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, transformer_options=transformer_options) freqs_cis = freqs_cis.to(img.device) + transformer_options["total_blocks"] = len(self.layers) + transformer_options["block_type"] = "double" img_input = img for i, layer in enumerate(self.layers): + transformer_options["block_index"] = i img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options) if "double_block" in patches: for p in patches["double_block"]: From ea2c117bc3c9d3b38d68e651905ed0d6dd682f92 Mon Sep 17 00:00:00 2001 From: Haoming <73768377+Haoming02@users.noreply.github.com> Date: Tue, 16 Dec 2025 09:59:16 +0800 Subject: [PATCH 752/759] [BlockInfo] Wan (#10845) * block info * animate * tensor * device * revert --- comfy/ldm/wan/model.py | 21 ++++++++++++++++++--- comfy/ldm/wan/model_animate.py | 3 +++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index a9d5e10d9..4216ce831 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -568,7 +568,10 @@ class WanModel(torch.nn.Module): patches_replace = transformer_options.get("patches_replace", {}) blocks_replace = patches_replace.get("dit", {}) + transformer_options["total_blocks"] = len(self.blocks) + transformer_options["block_type"] = "double" for i, block in enumerate(self.blocks): + transformer_options["block_index"] = i if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} @@ -763,7 +766,10 @@ class VaceWanModel(WanModel): patches_replace = transformer_options.get("patches_replace", {}) blocks_replace = patches_replace.get("dit", {}) + transformer_options["total_blocks"] = len(self.blocks) + transformer_options["block_type"] = "double" for i, block in enumerate(self.blocks): + transformer_options["block_index"] = i if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} @@ -862,7 +868,10 @@ class CameraWanModel(WanModel): patches_replace = transformer_options.get("patches_replace", {}) blocks_replace = patches_replace.get("dit", {}) + transformer_options["total_blocks"] = len(self.blocks) + transformer_options["block_type"] = "double" for i, block in enumerate(self.blocks): + transformer_options["block_index"] = i if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} @@ -1326,16 +1335,19 @@ class WanModel_S2V(WanModel): patches_replace = transformer_options.get("patches_replace", {}) blocks_replace = patches_replace.get("dit", {}) + transformer_options["total_blocks"] = len(self.blocks) + transformer_options["block_type"] = "double" for i, block in enumerate(self.blocks): + transformer_options["block_index"] = i if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} - out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"]) + out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], transformer_options=args["transformer_options"]) return out - out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap}) + out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap}) x = out["img"] else: - x = block(x, e=e0, freqs=freqs, context=context) + x = block(x, e=e0, freqs=freqs, context=context, transformer_options=transformer_options) if audio_emb is not None: x = self.audio_injector(x, i, audio_emb, audio_emb_global, seq_len) # head @@ -1574,7 +1586,10 @@ class HumoWanModel(WanModel): patches_replace = transformer_options.get("patches_replace", {}) blocks_replace = patches_replace.get("dit", {}) + transformer_options["total_blocks"] = len(self.blocks) + transformer_options["block_type"] = "double" for i, block in enumerate(self.blocks): + transformer_options["block_index"] = i if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} diff --git a/comfy/ldm/wan/model_animate.py b/comfy/ldm/wan/model_animate.py index 7c87835d4..84d7adec4 100644 --- a/comfy/ldm/wan/model_animate.py +++ b/comfy/ldm/wan/model_animate.py @@ -523,7 +523,10 @@ class AnimateWanModel(WanModel): patches_replace = transformer_options.get("patches_replace", {}) blocks_replace = patches_replace.get("dit", {}) + transformer_options["total_blocks"] = len(self.blocks) + transformer_options["block_type"] = "double" for i, block in enumerate(self.blocks): + transformer_options["block_index"] = i if ("double_block", i) in blocks_replace: def block_wrap(args): out = {} From 683569de5527379d9a095af88a9e1349fb7e46b5 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 15 Dec 2025 19:33:27 -0800 Subject: [PATCH 753/759] Only enable fp16 on ZImage on newer pytorch. (#11344) --- comfy/supported_models.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 834dfcffc..1888f35ba 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -28,6 +28,7 @@ from . import supported_models_base from . import latent_formats from . import diffusers_convert +import comfy.model_management class SD15(supported_models_base.BASE): unet_config = { @@ -1028,7 +1029,13 @@ class ZImage(Lumina2): memory_usage_factor = 2.0 - supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] + supported_inference_dtypes = [torch.bfloat16, torch.float32] + + def __init__(self, unet_config): + super().__init__(unet_config) + if comfy.model_management.extended_fp16_support(): + self.supported_inference_dtypes = self.supported_inference_dtypes.copy() + self.supported_inference_dtypes.insert(1, torch.float16) def clip_target(self, state_dict={}): pref = self.text_encoder_key_prefix[0] From 3d082c32065e0653490b9a4ae45dd33b6c7bffb7 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Mon, 15 Dec 2025 20:35:37 -0800 Subject: [PATCH 754/759] bump comfyui-frontend-package to 1.34.9 (patch) (#11342) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 117260515..9b9e61683 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.34.8 +comfyui-frontend-package==1.34.9 comfyui-workflow-templates==0.7.59 comfyui-embedded-docs==0.3.1 torch From 645ee1881e739b3013eeb26dbb335280bfbf443e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 15 Dec 2025 20:38:12 -0800 Subject: [PATCH 755/759] Inpainting for z image fun control. Use the ZImageFunControlnet node. (#11346) image -> control image ex: pose inpaint_image -> image for inpainting mask -> inpaint mask --- comfy_extras/nodes_model_patch.py | 77 ++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 16 deletions(-) diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index fdd5d0d3f..2a0cfcf18 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -313,22 +313,46 @@ class ZImageControlPatch: self.inpaint_image = inpaint_image self.mask = mask self.strength = strength - self.encoded_image = self.encode_latent_cond(image) - self.encoded_image_size = (image.shape[1], image.shape[2]) + self.is_inpaint = self.model_patch.model.additional_in_dim > 0 + + skip_encoding = False + if self.image is not None and self.inpaint_image is not None: + if self.image.shape != self.inpaint_image.shape: + skip_encoding = True + + if skip_encoding: + self.encoded_image = None + else: + self.encoded_image = self.encode_latent_cond(self.image, self.inpaint_image) + if self.image is None: + self.encoded_image_size = (self.inpaint_image.shape[1], self.inpaint_image.shape[2]) + else: + self.encoded_image_size = (self.image.shape[1], self.image.shape[2]) self.temp_data = None - def encode_latent_cond(self, control_image, inpaint_image=None): - latent_image = comfy.latent_formats.Flux().process_in(self.vae.encode(control_image)) - if self.model_patch.model.additional_in_dim > 0: - if self.mask is None: - mask_ = torch.zeros_like(latent_image)[:, :1] - else: - mask_ = comfy.utils.common_upscale(self.mask.mean(dim=1, keepdim=True), latent_image.shape[-1], latent_image.shape[-2], "bilinear", "none") + def encode_latent_cond(self, control_image=None, inpaint_image=None): + latent_image = None + if control_image is not None: + latent_image = comfy.latent_formats.Flux().process_in(self.vae.encode(control_image)) + + if self.is_inpaint: if inpaint_image is None: inpaint_image = torch.ones_like(control_image) * 0.5 + if self.mask is not None: + mask_inpaint = comfy.utils.common_upscale(self.mask.view(self.mask.shape[0], -1, self.mask.shape[-2], self.mask.shape[-1]).mean(dim=1, keepdim=True), inpaint_image.shape[-2], inpaint_image.shape[-3], "bilinear", "center") + inpaint_image = ((inpaint_image - 0.5) * mask_inpaint.movedim(1, -1).round()) + 0.5 + inpaint_image_latent = comfy.latent_formats.Flux().process_in(self.vae.encode(inpaint_image)) + if self.mask is None: + mask_ = torch.zeros_like(inpaint_image_latent)[:, :1] + else: + mask_ = comfy.utils.common_upscale(self.mask.view(self.mask.shape[0], -1, self.mask.shape[-2], self.mask.shape[-1]).mean(dim=1, keepdim=True), inpaint_image_latent.shape[-1], inpaint_image_latent.shape[-2], "nearest", "center") + + if latent_image is None: + latent_image = comfy.latent_formats.Flux().process_in(self.vae.encode(torch.ones_like(inpaint_image) * 0.5)) + return torch.cat([latent_image, mask_, inpaint_image_latent], dim=1) else: return latent_image @@ -344,13 +368,18 @@ class ZImageControlPatch: block_type = kwargs.get("block_type", "") spacial_compression = self.vae.spacial_compression_encode() if self.encoded_image is None or self.encoded_image_size != (x.shape[-2] * spacial_compression, x.shape[-1] * spacial_compression): - image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") + image_scaled = None + if self.image is not None: + image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center").movedim(1, -1) + self.encoded_image_size = (image_scaled.shape[-3], image_scaled.shape[-2]) + inpaint_scaled = None if self.inpaint_image is not None: inpaint_scaled = comfy.utils.common_upscale(self.inpaint_image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center").movedim(1, -1) + self.encoded_image_size = (inpaint_scaled.shape[-3], inpaint_scaled.shape[-2]) + loaded_models = comfy.model_management.loaded_models(only_currently_used=True) - self.encoded_image = self.encode_latent_cond(image_scaled.movedim(1, -1), inpaint_scaled) - self.encoded_image_size = (image_scaled.shape[-2], image_scaled.shape[-1]) + self.encoded_image = self.encode_latent_cond(image_scaled, inpaint_scaled) comfy.model_management.load_models_gpu(loaded_models) cnet_blocks = self.model_patch.model.n_control_layers @@ -391,7 +420,8 @@ class ZImageControlPatch: def to(self, device_or_dtype): if isinstance(device_or_dtype, torch.device): - self.encoded_image = self.encoded_image.to(device_or_dtype) + if self.encoded_image is not None: + self.encoded_image = self.encoded_image.to(device_or_dtype) self.temp_data = None return self @@ -414,9 +444,12 @@ class QwenImageDiffsynthControlnet: CATEGORY = "advanced/loaders/qwen" - def diffsynth_controlnet(self, model, model_patch, vae, image, strength, mask=None): + def diffsynth_controlnet(self, model, model_patch, vae, image=None, strength=1.0, inpaint_image=None, mask=None): model_patched = model.clone() - image = image[:, :, :, :3] + if image is not None: + image = image[:, :, :, :3] + if inpaint_image is not None: + inpaint_image = inpaint_image[:, :, :, :3] if mask is not None: if mask.ndim == 3: mask = mask.unsqueeze(1) @@ -425,13 +458,24 @@ class QwenImageDiffsynthControlnet: mask = 1.0 - mask if isinstance(model_patch.model, comfy.ldm.lumina.controlnet.ZImage_Control): - patch = ZImageControlPatch(model_patch, vae, image, strength, mask=mask) + patch = ZImageControlPatch(model_patch, vae, image, strength, inpaint_image=inpaint_image, mask=mask) model_patched.set_model_noise_refiner_patch(patch) model_patched.set_model_double_block_patch(patch) else: model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength, mask)) return (model_patched,) +class ZImageFunControlnet(QwenImageDiffsynthControlnet): + @classmethod + def INPUT_TYPES(s): + return {"required": { "model": ("MODEL",), + "model_patch": ("MODEL_PATCH",), + "vae": ("VAE",), + "strength": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01}), + }, + "optional": {"image": ("IMAGE",), "inpaint_image": ("IMAGE",), "mask": ("MASK",)}} + + CATEGORY = "advanced/loaders/zimage" class UsoStyleProjectorPatch: def __init__(self, model_patch, encoded_image): @@ -479,5 +523,6 @@ class USOStyleReference: NODE_CLASS_MAPPINGS = { "ModelPatchLoader": ModelPatchLoader, "QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet, + "ZImageFunControlnet": ZImageFunControlnet, "USOStyleReference": USOStyleReference, } From bc606d7d645f9edfcac7cca3558210d3ee391d94 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 15 Dec 2025 22:26:55 -0800 Subject: [PATCH 756/759] Add a way to set the default ref method in the qwen image code. (#11349) --- comfy/ldm/qwen_image/model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 96590088b..8481f7711 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -322,6 +322,7 @@ class QwenImageTransformer2DModel(nn.Module): pooled_projection_dim: int = 768, guidance_embeds: bool = False, axes_dims_rope: Tuple[int, int, int] = (16, 56, 56), + default_ref_method="index", image_model=None, final_layer=True, dtype=None, @@ -334,6 +335,7 @@ class QwenImageTransformer2DModel(nn.Module): self.in_channels = in_channels self.out_channels = out_channels or in_channels self.inner_dim = num_attention_heads * attention_head_dim + self.default_ref_method = default_ref_method self.pe_embedder = EmbedND(dim=attention_head_dim, theta=10000, axes_dim=list(axes_dims_rope)) @@ -416,7 +418,7 @@ class QwenImageTransformer2DModel(nn.Module): h = 0 w = 0 index = 0 - ref_method = kwargs.get("ref_latents_method", "index") + ref_method = kwargs.get("ref_latents_method", self.default_ref_method) index_ref_method = (ref_method == "index") or (ref_method == "index_timestep_zero") timestep_zero = ref_method == "index_timestep_zero" for ref in ref_latents: From 9304e47351be8d178a093b30bcaf5d72c3a2baf5 Mon Sep 17 00:00:00 2001 From: Benjamin Lu Date: Mon, 15 Dec 2025 23:24:18 -0800 Subject: [PATCH 757/759] Update workflows for new release process (#11064) * Update release workflows for branch process * Adjust branch order in workflow triggers * Revert changes in test workflows --- .github/workflows/test-ci.yml | 1 + .github/workflows/test-execution.yml | 4 ++-- .github/workflows/test-launch.yml | 4 ++-- .github/workflows/test-unit.yml | 4 ++-- .github/workflows/update-version.yml | 1 + 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-ci.yml b/.github/workflows/test-ci.yml index 1660ec8e3..adfc5dd32 100644 --- a/.github/workflows/test-ci.yml +++ b/.github/workflows/test-ci.yml @@ -5,6 +5,7 @@ on: push: branches: - master + - release/** paths-ignore: - 'app/**' - 'input/**' diff --git a/.github/workflows/test-execution.yml b/.github/workflows/test-execution.yml index 00ef07ebf..9012633d8 100644 --- a/.github/workflows/test-execution.yml +++ b/.github/workflows/test-execution.yml @@ -2,9 +2,9 @@ name: Execution Tests on: push: - branches: [ main, master ] + branches: [ main, master, release/** ] pull_request: - branches: [ main, master ] + branches: [ main, master, release/** ] jobs: test: diff --git a/.github/workflows/test-launch.yml b/.github/workflows/test-launch.yml index 1735fd83b..fd70aff23 100644 --- a/.github/workflows/test-launch.yml +++ b/.github/workflows/test-launch.yml @@ -2,9 +2,9 @@ name: Test server launches without errors on: push: - branches: [ main, master ] + branches: [ main, master, release/** ] pull_request: - branches: [ main, master ] + branches: [ main, master, release/** ] jobs: test: diff --git a/.github/workflows/test-unit.yml b/.github/workflows/test-unit.yml index 00caf5b8a..d05179cd3 100644 --- a/.github/workflows/test-unit.yml +++ b/.github/workflows/test-unit.yml @@ -2,9 +2,9 @@ name: Unit Tests on: push: - branches: [ main, master ] + branches: [ main, master, release/** ] pull_request: - branches: [ main, master ] + branches: [ main, master, release/** ] jobs: test: diff --git a/.github/workflows/update-version.yml b/.github/workflows/update-version.yml index d9d488974..c2343cc39 100644 --- a/.github/workflows/update-version.yml +++ b/.github/workflows/update-version.yml @@ -6,6 +6,7 @@ on: - "pyproject.toml" branches: - master + - release/** jobs: update-version: From 65e2103b09f66e45438445fb0e99709ae7639869 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 16 Dec 2025 23:51:48 +0200 Subject: [PATCH 758/759] feat(api-nodes): add Wan2.6 model to video nodes (#11357) --- comfy_api_nodes/nodes_wan.py | 162 ++++++++++++++++++++--------------- 1 file changed, 95 insertions(+), 67 deletions(-) diff --git a/comfy_api_nodes/nodes_wan.py b/comfy_api_nodes/nodes_wan.py index 2aab3c2ff..17b680e13 100644 --- a/comfy_api_nodes/nodes_wan.py +++ b/comfy_api_nodes/nodes_wan.py @@ -1,7 +1,5 @@ import re -from typing import Optional -import torch from pydantic import BaseModel, Field from typing_extensions import override @@ -21,26 +19,26 @@ from comfy_api_nodes.util import ( class Text2ImageInputField(BaseModel): prompt: str = Field(...) - negative_prompt: Optional[str] = Field(None) + negative_prompt: str | None = Field(None) class Image2ImageInputField(BaseModel): prompt: str = Field(...) - negative_prompt: Optional[str] = Field(None) + negative_prompt: str | None = Field(None) images: list[str] = Field(..., min_length=1, max_length=2) class Text2VideoInputField(BaseModel): prompt: str = Field(...) - negative_prompt: Optional[str] = Field(None) - audio_url: Optional[str] = Field(None) + negative_prompt: str | None = Field(None) + audio_url: str | None = Field(None) class Image2VideoInputField(BaseModel): prompt: str = Field(...) - negative_prompt: Optional[str] = Field(None) + negative_prompt: str | None = Field(None) img_url: str = Field(...) - audio_url: Optional[str] = Field(None) + audio_url: str | None = Field(None) class Txt2ImageParametersField(BaseModel): @@ -52,7 +50,7 @@ class Txt2ImageParametersField(BaseModel): class Image2ImageParametersField(BaseModel): - size: Optional[str] = Field(None) + size: str | None = Field(None) n: int = Field(1, description="Number of images to generate.") # we support only value=1 seed: int = Field(..., ge=0, le=2147483647) watermark: bool = Field(True) @@ -61,19 +59,21 @@ class Image2ImageParametersField(BaseModel): class Text2VideoParametersField(BaseModel): size: str = Field(...) seed: int = Field(..., ge=0, le=2147483647) - duration: int = Field(5, ge=5, le=10) + duration: int = Field(5, ge=5, le=15) prompt_extend: bool = Field(True) watermark: bool = Field(True) - audio: bool = Field(False, description="Should be audio generated automatically") + audio: bool = Field(False, description="Whether to generate audio automatically.") + shot_type: str = Field("single") class Image2VideoParametersField(BaseModel): resolution: str = Field(...) seed: int = Field(..., ge=0, le=2147483647) - duration: int = Field(5, ge=5, le=10) + duration: int = Field(5, ge=5, le=15) prompt_extend: bool = Field(True) watermark: bool = Field(True) - audio: bool = Field(False, description="Should be audio generated automatically") + audio: bool = Field(False, description="Whether to generate audio automatically.") + shot_type: str = Field("single") class Text2ImageTaskCreationRequest(BaseModel): @@ -106,39 +106,39 @@ class TaskCreationOutputField(BaseModel): class TaskCreationResponse(BaseModel): - output: Optional[TaskCreationOutputField] = Field(None) + output: TaskCreationOutputField | None = Field(None) request_id: str = Field(...) - code: Optional[str] = Field(None, description="The error code of the failed request.") - message: Optional[str] = Field(None, description="Details of the failed request.") + code: str | None = Field(None, description="Error code for the failed request.") + message: str | None = Field(None, description="Details about the failed request.") class TaskResult(BaseModel): - url: Optional[str] = Field(None) - code: Optional[str] = Field(None) - message: Optional[str] = Field(None) + url: str | None = Field(None) + code: str | None = Field(None) + message: str | None = Field(None) class ImageTaskStatusOutputField(TaskCreationOutputField): task_id: str = Field(...) task_status: str = Field(...) - results: Optional[list[TaskResult]] = Field(None) + results: list[TaskResult] | None = Field(None) class VideoTaskStatusOutputField(TaskCreationOutputField): task_id: str = Field(...) task_status: str = Field(...) - video_url: Optional[str] = Field(None) - code: Optional[str] = Field(None) - message: Optional[str] = Field(None) + video_url: str | None = Field(None) + code: str | None = Field(None) + message: str | None = Field(None) class ImageTaskStatusResponse(BaseModel): - output: Optional[ImageTaskStatusOutputField] = Field(None) + output: ImageTaskStatusOutputField | None = Field(None) request_id: str = Field(...) class VideoTaskStatusResponse(BaseModel): - output: Optional[VideoTaskStatusOutputField] = Field(None) + output: VideoTaskStatusOutputField | None = Field(None) request_id: str = Field(...) @@ -152,7 +152,7 @@ class WanTextToImageApi(IO.ComfyNode): node_id="WanTextToImageApi", display_name="Wan Text to Image", category="api node/image/Wan", - description="Generates image based on text prompt.", + description="Generates an image based on a text prompt.", inputs=[ IO.Combo.Input( "model", @@ -164,13 +164,13 @@ class WanTextToImageApi(IO.ComfyNode): "prompt", multiline=True, default="", - tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", + tooltip="Prompt describing the elements and visual features. Supports English and Chinese.", ), IO.String.Input( "negative_prompt", multiline=True, default="", - tooltip="Negative text prompt to guide what to avoid.", + tooltip="Negative prompt describing what to avoid.", optional=True, ), IO.Int.Input( @@ -209,7 +209,7 @@ class WanTextToImageApi(IO.ComfyNode): IO.Boolean.Input( "watermark", default=True, - tooltip='Whether to add an "AI generated" watermark to the result.', + tooltip="Whether to add an AI-generated watermark to the result.", optional=True, ), ], @@ -252,7 +252,7 @@ class WanTextToImageApi(IO.ComfyNode): ), ) if not initial_response.output: - raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}") response = await poll_op( cls, ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), @@ -272,7 +272,7 @@ class WanImageToImageApi(IO.ComfyNode): display_name="Wan Image to Image", category="api node/image/Wan", description="Generates an image from one or two input images and a text prompt. " - "The output image is currently fixed at 1.6 MP; its aspect ratio matches the input image(s).", + "The output image is currently fixed at 1.6 MP, and its aspect ratio matches the input image(s).", inputs=[ IO.Combo.Input( "model", @@ -282,19 +282,19 @@ class WanImageToImageApi(IO.ComfyNode): ), IO.Image.Input( "image", - tooltip="Single-image editing or multi-image fusion, maximum 2 images.", + tooltip="Single-image editing or multi-image fusion. Maximum 2 images.", ), IO.String.Input( "prompt", multiline=True, default="", - tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", + tooltip="Prompt describing the elements and visual features. Supports English and Chinese.", ), IO.String.Input( "negative_prompt", multiline=True, default="", - tooltip="Negative text prompt to guide what to avoid.", + tooltip="Negative prompt describing what to avoid.", optional=True, ), # redo this later as an optional combo of recommended resolutions @@ -328,7 +328,7 @@ class WanImageToImageApi(IO.ComfyNode): IO.Boolean.Input( "watermark", default=True, - tooltip='Whether to add an "AI generated" watermark to the result.', + tooltip="Whether to add an AI-generated watermark to the result.", optional=True, ), ], @@ -347,7 +347,7 @@ class WanImageToImageApi(IO.ComfyNode): async def execute( cls, model: str, - image: torch.Tensor, + image: Input.Image, prompt: str, negative_prompt: str = "", # width: int = 1024, @@ -357,7 +357,7 @@ class WanImageToImageApi(IO.ComfyNode): ): n_images = get_number_of_images(image) if n_images not in (1, 2): - raise ValueError(f"Expected 1 or 2 input images, got {n_images}.") + raise ValueError(f"Expected 1 or 2 input images, but got {n_images}.") images = [] for i in image: images.append("data:image/png;base64," + tensor_to_base64_string(i, total_pixels=4096 * 4096)) @@ -376,7 +376,7 @@ class WanImageToImageApi(IO.ComfyNode): ), ) if not initial_response.output: - raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}") response = await poll_op( cls, ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), @@ -395,25 +395,25 @@ class WanTextToVideoApi(IO.ComfyNode): node_id="WanTextToVideoApi", display_name="Wan Text to Video", category="api node/video/Wan", - description="Generates video based on text prompt.", + description="Generates a video based on a text prompt.", inputs=[ IO.Combo.Input( "model", - options=["wan2.5-t2v-preview"], - default="wan2.5-t2v-preview", + options=["wan2.5-t2v-preview", "wan2.6-t2v"], + default="wan2.6-t2v", tooltip="Model to use.", ), IO.String.Input( "prompt", multiline=True, default="", - tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", + tooltip="Prompt describing the elements and visual features. Supports English and Chinese.", ), IO.String.Input( "negative_prompt", multiline=True, default="", - tooltip="Negative text prompt to guide what to avoid.", + tooltip="Negative prompt describing what to avoid.", optional=True, ), IO.Combo.Input( @@ -433,23 +433,23 @@ class WanTextToVideoApi(IO.ComfyNode): "1080p: 4:3 (1632x1248)", "1080p: 3:4 (1248x1632)", ], - default="480p: 1:1 (624x624)", + default="720p: 1:1 (960x960)", optional=True, ), IO.Int.Input( "duration", default=5, min=5, - max=10, + max=15, step=5, display_mode=IO.NumberDisplay.number, - tooltip="Available durations: 5 and 10 seconds", + tooltip="A 15-second duration is available only for the Wan 2.6 model.", optional=True, ), IO.Audio.Input( "audio", optional=True, - tooltip="Audio must contain a clear, loud voice, without extraneous noise, background music.", + tooltip="Audio must contain a clear, loud voice, without extraneous noise or background music.", ), IO.Int.Input( "seed", @@ -466,7 +466,7 @@ class WanTextToVideoApi(IO.ComfyNode): "generate_audio", default=False, optional=True, - tooltip="If there is no audio input, generate audio automatically.", + tooltip="If no audio input is provided, generate audio automatically.", ), IO.Boolean.Input( "prompt_extend", @@ -477,7 +477,15 @@ class WanTextToVideoApi(IO.ComfyNode): IO.Boolean.Input( "watermark", default=True, - tooltip='Whether to add an "AI generated" watermark to the result.', + tooltip="Whether to add an AI-generated watermark to the result.", + optional=True, + ), + IO.Combo.Input( + "shot_type", + options=["single", "multi"], + tooltip="Specifies the shot type for the generated video, that is, whether the video is a " + "single continuous shot or multiple shots with cuts. " + "This parameter takes effect only when prompt_extend is True.", optional=True, ), ], @@ -498,14 +506,19 @@ class WanTextToVideoApi(IO.ComfyNode): model: str, prompt: str, negative_prompt: str = "", - size: str = "480p: 1:1 (624x624)", + size: str = "720p: 1:1 (960x960)", duration: int = 5, - audio: Optional[Input.Audio] = None, + audio: Input.Audio | None = None, seed: int = 0, generate_audio: bool = False, prompt_extend: bool = True, watermark: bool = True, + shot_type: str = "single", ): + if "480p" in size and model == "wan2.6-t2v": + raise ValueError("The Wan 2.6 model does not support 480p.") + if duration == 15 and model == "wan2.5-t2v-preview": + raise ValueError("A 15-second duration is supported only by the Wan 2.6 model.") width, height = RES_IN_PARENS.search(size).groups() audio_url = None if audio is not None: @@ -526,11 +539,12 @@ class WanTextToVideoApi(IO.ComfyNode): audio=generate_audio, prompt_extend=prompt_extend, watermark=watermark, + shot_type=shot_type, ), ), ) if not initial_response.output: - raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}") response = await poll_op( cls, ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), @@ -549,12 +563,12 @@ class WanImageToVideoApi(IO.ComfyNode): node_id="WanImageToVideoApi", display_name="Wan Image to Video", category="api node/video/Wan", - description="Generates video based on the first frame and text prompt.", + description="Generates a video from the first frame and a text prompt.", inputs=[ IO.Combo.Input( "model", - options=["wan2.5-i2v-preview"], - default="wan2.5-i2v-preview", + options=["wan2.5-i2v-preview", "wan2.6-i2v"], + default="wan2.6-i2v", tooltip="Model to use.", ), IO.Image.Input( @@ -564,13 +578,13 @@ class WanImageToVideoApi(IO.ComfyNode): "prompt", multiline=True, default="", - tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", + tooltip="Prompt describing the elements and visual features. Supports English and Chinese.", ), IO.String.Input( "negative_prompt", multiline=True, default="", - tooltip="Negative text prompt to guide what to avoid.", + tooltip="Negative prompt describing what to avoid.", optional=True, ), IO.Combo.Input( @@ -580,23 +594,23 @@ class WanImageToVideoApi(IO.ComfyNode): "720P", "1080P", ], - default="480P", + default="720P", optional=True, ), IO.Int.Input( "duration", default=5, min=5, - max=10, + max=15, step=5, display_mode=IO.NumberDisplay.number, - tooltip="Available durations: 5 and 10 seconds", + tooltip="Duration 15 available only for WAN2.6 model.", optional=True, ), IO.Audio.Input( "audio", optional=True, - tooltip="Audio must contain a clear, loud voice, without extraneous noise, background music.", + tooltip="Audio must contain a clear, loud voice, without extraneous noise or background music.", ), IO.Int.Input( "seed", @@ -613,7 +627,7 @@ class WanImageToVideoApi(IO.ComfyNode): "generate_audio", default=False, optional=True, - tooltip="If there is no audio input, generate audio automatically.", + tooltip="If no audio input is provided, generate audio automatically.", ), IO.Boolean.Input( "prompt_extend", @@ -624,7 +638,15 @@ class WanImageToVideoApi(IO.ComfyNode): IO.Boolean.Input( "watermark", default=True, - tooltip='Whether to add an "AI generated" watermark to the result.', + tooltip="Whether to add an AI-generated watermark to the result.", + optional=True, + ), + IO.Combo.Input( + "shot_type", + options=["single", "multi"], + tooltip="Specifies the shot type for the generated video, that is, whether the video is a " + "single continuous shot or multiple shots with cuts. " + "This parameter takes effect only when prompt_extend is True.", optional=True, ), ], @@ -643,19 +665,24 @@ class WanImageToVideoApi(IO.ComfyNode): async def execute( cls, model: str, - image: torch.Tensor, + image: Input.Image, prompt: str, negative_prompt: str = "", - resolution: str = "480P", + resolution: str = "720P", duration: int = 5, - audio: Optional[Input.Audio] = None, + audio: Input.Audio | None = None, seed: int = 0, generate_audio: bool = False, prompt_extend: bool = True, watermark: bool = True, + shot_type: str = "single", ): if get_number_of_images(image) != 1: raise ValueError("Exactly one input image is required.") + if "480P" in resolution and model == "wan2.6-i2v": + raise ValueError("The Wan 2.6 model does not support 480P.") + if duration == 15 and model == "wan2.5-i2v-preview": + raise ValueError("A 15-second duration is supported only by the Wan 2.6 model.") image_url = "data:image/png;base64," + tensor_to_base64_string(image, total_pixels=2000 * 2000) audio_url = None if audio is not None: @@ -677,11 +704,12 @@ class WanImageToVideoApi(IO.ComfyNode): audio=generate_audio, prompt_extend=prompt_extend, watermark=watermark, + shot_type=shot_type, ), ), ) if not initial_response.output: - raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}") response = await poll_op( cls, ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), From ffdd53b327f7ebd48cf81a1c8b06d846cf354a66 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 16 Dec 2025 14:03:17 -0800 Subject: [PATCH 759/759] Check state dict key to auto enable the index_timestep_zero ref method. (#11362) --- comfy/ldm/qwen_image/model.py | 3 +++ comfy/model_detection.py | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 8481f7711..902af30ed 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -363,6 +363,9 @@ class QwenImageTransformer2DModel(nn.Module): for _ in range(num_layers) ]) + if self.default_ref_method == "index_timestep_zero": + self.register_buffer("__index_timestep_zero__", torch.tensor([])) + if final_layer: self.norm_out = LastLayer(self.inner_dim, self.inner_dim, dtype=dtype, device=device, operations=operations) self.proj_out = operations.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True, dtype=dtype, device=device) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index dd6a703f6..7148c77fd 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -259,7 +259,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["nerf_tile_size"] = 512 dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear" dit_config["nerf_embedder_dtype"] = torch.float32 - if "__x0__" in state_dict_keys: # x0 pred + if "{}__x0__".format(key_prefix) in state_dict_keys: # x0 pred dit_config["use_x0"] = True else: dit_config["use_x0"] = False @@ -618,6 +618,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["image_model"] = "qwen_image" dit_config["in_channels"] = state_dict['{}img_in.weight'.format(key_prefix)].shape[1] dit_config["num_layers"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.') + if "{}__index_timestep_zero__".format(key_prefix) in state_dict_keys: # 2511 + dit_config["default_ref_method"] = "index_timestep_zero" return dit_config if '{}visual_transformer_blocks.0.cross_attention.key_norm.weight'.format(key_prefix) in state_dict_keys: # Kandinsky 5